28
28
#ifdef HAVE_LANGINFO_CODESET
29
29
#include <langinfo.h>
32
34
#include "libjnlib-config.h"
33
35
#include "stringhelp.h"
34
36
#include "utf8conv.h"
37
static ushort koi8_unicode[128] = {
38
0x2500, 0x2502, 0x250c, 0x2510, 0x2514, 0x2518, 0x251c, 0x2524,
39
0x252c, 0x2534, 0x253c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
40
0x2591, 0x2592, 0x2593, 0x2320, 0x25a0, 0x2219, 0x221a, 0x2248,
41
0x2264, 0x2265, 0x00a0, 0x2321, 0x00b0, 0x00b2, 0x00b7, 0x00f7,
42
0x2550, 0x2551, 0x2552, 0x0451, 0x2553, 0x2554, 0x2555, 0x2556,
43
0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d, 0x255e,
44
0x255f, 0x2560, 0x2561, 0x0401, 0x2562, 0x2563, 0x2564, 0x2565,
45
0x2566, 0x2567, 0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x00a9,
46
0x044e, 0x0430, 0x0431, 0x0446, 0x0434, 0x0435, 0x0444, 0x0433,
47
0x0445, 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e,
48
0x043f, 0x044f, 0x0440, 0x0441, 0x0442, 0x0443, 0x0436, 0x0432,
49
0x044c, 0x044b, 0x0437, 0x0448, 0x044d, 0x0449, 0x0447, 0x044a,
50
0x042e, 0x0410, 0x0411, 0x0426, 0x0414, 0x0415, 0x0424, 0x0413,
51
0x0425, 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e,
52
0x041f, 0x042f, 0x0420, 0x0421, 0x0422, 0x0423, 0x0416, 0x0412,
53
0x042c, 0x042b, 0x0417, 0x0428, 0x042d, 0x0429, 0x0427, 0x042a
56
static ushort latin2_unicode[128] = {
57
0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
58
0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F,
59
0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
60
0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F,
61
0x00A0, 0x0104, 0x02D8, 0x0141, 0x00A4, 0x013D, 0x015A, 0x00A7,
62
0x00A8, 0x0160, 0x015E, 0x0164, 0x0179, 0x00AD, 0x017D, 0x017B,
63
0x00B0, 0x0105, 0x02DB, 0x0142, 0x00B4, 0x013E, 0x015B, 0x02C7,
64
0x00B8, 0x0161, 0x015F, 0x0165, 0x017A, 0x02DD, 0x017E, 0x017C,
65
0x0154, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0139, 0x0106, 0x00C7,
66
0x010C, 0x00C9, 0x0118, 0x00CB, 0x011A, 0x00CD, 0x00CE, 0x010E,
67
0x0110, 0x0143, 0x0147, 0x00D3, 0x00D4, 0x0150, 0x00D6, 0x00D7,
68
0x0158, 0x016E, 0x00DA, 0x0170, 0x00DC, 0x00DD, 0x0162, 0x00DF,
69
0x0155, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x013A, 0x0107, 0x00E7,
70
0x010D, 0x00E9, 0x0119, 0x00EB, 0x011B, 0x00ED, 0x00EE, 0x010F,
71
0x0111, 0x0144, 0x0148, 0x00F3, 0x00F4, 0x0151, 0x00F6, 0x00F7,
72
0x0159, 0x016F, 0x00FA, 0x0171, 0x00FC, 0x00FD, 0x0163, 0x02D9
76
42
static const char *active_charset_name = "iso-8859-1";
77
static ushort *active_charset = NULL;
78
static int no_translation = 0;
43
static unsigned short *active_charset;
44
static int no_translation; /* Set to true if we simply pass the input through. */
45
static int use_iconv; /* iconv conversion functions required. */
49
/* Error handler for iconv failures. This is needed to not clutter the
50
output with repeated diagnostics about a missing conversion. */
52
handle_iconv_error (const char *to, const char *from, int use_fallback)
56
static int shown1, shown2;
59
if (to && !strcmp (to, "utf-8"))
71
log_info (_("conversion from `%s' to `%s' not available\n"),
79
log_info (_("iconv_open failed: %s\n"), strerror (errno));
85
/* To avoid further error messages we fallback to Latin-1 for the
86
native encoding. This is justified as one can expect that on a
87
utf-8 enabled system nl_langinfo() will work and thus we won't
88
ever get here. Thus Latin-1 seems to be a reasonable
90
active_charset_name = "iso-8859-1";
92
active_charset = NULL;
81
99
set_native_charset (const char *newset)
101
const char *full_newset;
105
#ifdef HAVE_W32_SYSTEM
106
static char codepage[30];
110
/* We are a console program thus we need to use the
111
GetConsoleOutputCP function and not the GetACP which
112
would give the codepage for a GUI program. Note this is not
113
a bulletproof detection because GetConsoleCP might return a
114
different one for console input. Not sure how to cope with
115
that. If the console Code page is not known we fall back to
116
the system code page. */
117
cpno = GetConsoleOutputCP ();
120
sprintf (codepage, "CP%u", cpno );
121
/* Resolve alias. We use a long string and not the usual
122
array to optimize if the code is taken to a DSO. Taken from
125
for (aliases = ("CP936" "\0" "GBK" "\0"
126
"CP1361" "\0" "JOHAB" "\0"
127
"CP20127" "\0" "ASCII" "\0"
128
"CP20866" "\0" "KOI8-R" "\0"
129
"CP21866" "\0" "KOI8-RU" "\0"
130
"CP28591" "\0" "ISO-8859-1" "\0"
131
"CP28592" "\0" "ISO-8859-2" "\0"
132
"CP28593" "\0" "ISO-8859-3" "\0"
133
"CP28594" "\0" "ISO-8859-4" "\0"
134
"CP28595" "\0" "ISO-8859-5" "\0"
135
"CP28596" "\0" "ISO-8859-6" "\0"
136
"CP28597" "\0" "ISO-8859-7" "\0"
137
"CP28598" "\0" "ISO-8859-8" "\0"
138
"CP28599" "\0" "ISO-8859-9" "\0"
139
"CP28605" "\0" "ISO-8859-15" "\0"
140
"CP65001" "\0" "UTF-8" "\0");
142
aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
144
if (!strcmp (codepage, aliases) ||(*aliases == '*' && !aliases[1]))
146
newset = aliases + strlen (aliases) + 1;
151
#else /*!HAVE_W32_SYSTEM*/
84
153
#ifdef HAVE_LANGINFO_CODESET
85
newset = nl_langinfo (CODESET);
154
newset = nl_langinfo (CODESET);
155
#else /*!HAVE_LANGINFO_CODESET*/
156
/* Try to get the used charset from environment variables. */
157
static char codepage[30];
158
const char *lc, *dot, *mod;
160
strcpy (codepage, "iso-8859-1");
161
lc = getenv ("LC_ALL");
164
lc = getenv ("LC_CTYPE");
166
lc = getenv ("LANG");
170
dot = strchr (lc, '.');
173
mod = strchr (++dot, '@');
175
mod = dot + strlen (dot);
176
if (mod - dot < sizeof codepage && dot != mod)
178
memcpy (codepage, dot, mod - dot);
179
codepage [mod - dot] = 0;
184
#endif /*!HAVE_LANGINFO_CODESET*/
185
#endif /*!HAVE_W32_SYSTEM*/
188
full_newset = newset;
90
189
if (strlen (newset) > 3 && !ascii_memcasecmp (newset, "iso", 3))
93
192
if (*newset == '-' || *newset == '_')
98
|| !ascii_strcasecmp (newset, "8859-1")
99
|| !ascii_strcasecmp (newset, "8859-15"))
196
/* Note that we silently assume that plain ASCII is actually meant
197
as Latin-1. This makes sense because many Unix systems don't have
198
their locale set up properly and thus would get annoying error
199
messages and we have to handle all the "bug" reports. Latin-1 has
200
always been the character set used for 8 bit characters on Unix
203
|| !ascii_strcasecmp (newset, "8859-1" )
204
|| !ascii_strcasecmp (newset, "646" )
205
|| !ascii_strcasecmp (newset, "ASCII" )
206
|| !ascii_strcasecmp (newset, "ANSI_X3.4-1968" )
101
209
active_charset_name = "iso-8859-1";
102
210
no_translation = 0;
103
211
active_charset = NULL;
105
else if (!ascii_strcasecmp (newset, "8859-2"))
107
active_charset_name = "iso-8859-2";
109
active_charset = latin2_unicode;
111
else if (!ascii_strcasecmp (newset, "koi8-r"))
113
active_charset_name = "koi8-r";
115
active_charset = koi8_unicode;
117
else if (!ascii_strcasecmp (newset, "utf8")
118
|| !ascii_strcasecmp (newset, "utf-8"))
214
else if ( !ascii_strcasecmp (newset, "utf8" )
215
|| !ascii_strcasecmp(newset, "utf-8") )
120
217
active_charset_name = "utf-8";
121
218
no_translation = 1;
122
219
active_charset = NULL;
226
#ifdef HAVE_W32_SYSTEM
227
if (load_libiconv ())
229
#endif /*HAVE_W32_SYSTEM*/
231
cd = iconv_open (full_newset, "utf-8");
232
if (cd == (iconv_t)-1)
234
handle_iconv_error (full_newset, "utf-8", 0);
238
cd = iconv_open ("utf-8", full_newset);
239
if (cd == (iconv_t)-1)
241
handle_iconv_error ("utf-8", full_newset, 0);
245
active_charset_name = full_newset;
247
active_charset = NULL;