1
/////////////////////////////////////////////////////////////////////////////
3
// Purpose: Unicode conversion classes
4
// Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5
// Ryan Norton, Fredrik Roubert (UTF7)
8
// RCS-ID: $Id: strconv.cpp,v 1.160.2.2 2006/01/18 16:32:46 JS Exp $
9
// Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10
// (c) 2000-2003 Vadim Zeitlin
11
// (c) 2004 Ryan Norton, Fredrik Roubert
12
// Licence: wxWindows licence
13
/////////////////////////////////////////////////////////////////////////////
15
// ============================================================================
17
// ============================================================================
19
// ----------------------------------------------------------------------------
21
// ----------------------------------------------------------------------------
23
#if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
24
#pragma implementation "strconv.h"
27
// For compilers that support precompilation, includes "wx.h".
28
#include "wx/wxprec.h"
39
#include "wx/strconv.h"
44
#include "wx/msw/private.h"
45
#include "wx/msw/missing.h"
56
#if defined(__WIN32__) && !defined(__WXMICROWIN__)
57
#define wxHAVE_WIN32_MB2WC
58
#endif // __WIN32__ but !__WXMICROWIN__
60
// ----------------------------------------------------------------------------
62
// ----------------------------------------------------------------------------
70
#include "wx/thread.h"
73
#include "wx/encconv.h"
74
#include "wx/fontmap.h"
79
#include <ATSUnicode.h>
80
#include <TextCommon.h>
81
#include <TextEncodingConverter.h>
84
#include "wx/mac/private.h" // includes mac headers
87
#define TRACE_STRCONV _T("strconv")
89
// ----------------------------------------------------------------------------
91
// ----------------------------------------------------------------------------
93
#define BSWAP_UCS4(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT32_SWAP_ALWAYS(str[_c]); }
94
#define BSWAP_UTF16(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT16_SWAP_ALWAYS(str[_c]); }
96
#if SIZEOF_WCHAR_T == 4
97
#define WC_NAME "UCS4"
98
#define WC_BSWAP BSWAP_UCS4
99
#ifdef WORDS_BIGENDIAN
100
#define WC_NAME_BEST "UCS-4BE"
102
#define WC_NAME_BEST "UCS-4LE"
104
#elif SIZEOF_WCHAR_T == 2
105
#define WC_NAME "UTF16"
106
#define WC_BSWAP BSWAP_UTF16
108
#ifdef WORDS_BIGENDIAN
109
#define WC_NAME_BEST "UTF-16BE"
111
#define WC_NAME_BEST "UTF-16LE"
113
#else // sizeof(wchar_t) != 2 nor 4
114
// does this ever happen?
115
#error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
118
// ============================================================================
120
// ============================================================================
122
// ----------------------------------------------------------------------------
123
// UTF-16 en/decoding to/from UCS-4
124
// ----------------------------------------------------------------------------
127
// Encodes the UCS-4 value `input` as UTF-16 code unit(s) written through
// `output`. Callers in this file compare the result against (size_t)-1.
static size_t encode_utf16(wxUint32 input, wxUint16 *output)
132
// the value is stored as a single 16-bit unit
*output = (wxUint16) input;
135
// values of 0x110000 and above are handled by a branch of their own
else if (input>=0x110000)
143
// two-unit form: the first unit carries the bits above the low ten
// (0xd7c0 == 0xd800 - (0x10000 >> 10)), the second the low ten bits
*output++ = (wxUint16) ((input >> 10)+0xd7c0);
144
*output = (wxUint16) ((input&0x3ff)+0xdc00);
150
// Decodes the UTF-16 sequence starting at `input` into `output`.
// Callers in this file compare the result against (size_t)-1 and otherwise
// use it as the number of 16-bit units to advance by.
static size_t decode_utf16(const wxUint16* input, wxUint32& output)
152
// first unit lies outside 0xd800..0xdfff, i.e. it is not a surrogate
if ((*input<0xd800) || (*input>0xdfff))
157
// second unit is not in the 0xdc00..0xdfff range
else if ((input[1]<0xdc00) || (input[1]>0xdfff))
164
// combine both units: undo the +0xd7c0 / +0xdc00 offsets and join the
// upper bits with the low ten bits
output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
170
// ----------------------------------------------------------------------------
172
// ----------------------------------------------------------------------------
174
// Out-of-line definition of the wxMBConv destructor.
wxMBConv::~wxMBConv()
176
// nothing to do here (necessary for Darwin linking probably)
179
// Converts the NUL-terminated multibyte string `psz` to a wide character
// buffer in two passes: the first MB2WC() call only measures, the second
// one converts into a buffer of the measured size.
const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
183
// calculate the length of the buffer needed first
184
size_t nLen = MB2WC(NULL, psz, 0);
185
// (size_t)-1 is the failure value of MB2WC()
if ( nLen != (size_t)-1 )
187
// now do the actual conversion
188
wxWCharBuffer buf(nLen);
189
nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL
190
// the second pass can fail as well, so its result is checked again
if ( nLen != (size_t)-1 )
197
// buffer constructed from a NULL pointer
wxWCharBuffer buf((wchar_t *)NULL);
202
// Converts the NUL-terminated wide string `pwz` to a multibyte buffer in
// two passes: the first WC2MB() call only measures, the second one converts
// into a buffer sized from that measurement.
const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
206
// measuring pass: no output buffer, size 0
size_t nLen = WC2MB(NULL, pwz, 0);
207
// (size_t)-1 is the failure value of WC2MB()
if ( nLen != (size_t)-1 )
209
wxCharBuffer buf(nLen+3); // space for a wxUint32 trailing zero
210
// the conversion is told it may write nLen + 4 bytes
nLen = WC2MB(buf.data(), pwz, nLen + 4);
211
if ( nLen != (size_t)-1 )
218
// buffer constructed from a NULL pointer
wxCharBuffer buf((char *)NULL);
223
// Converts `nStringLen` bytes of `szString`, which may contain embedded NUL
// characters, to wide characters. The input is processed one NUL-terminated
// substring at a time and the accumulated length is stored in `*pOutSize`,
// which must not be NULL.
const wxWCharBuffer wxMBConv::cMB2WC(const char *szString, size_t nStringLen, size_t* pOutSize) const
225
wxASSERT(pOutSize != NULL);
227
// one past the NUL that follows the last of the nStringLen bytes
const char* szEnd = szString + nStringLen + 1;
228
const char* szPos = szString;
229
const char* szStart = szPos;
231
size_t nActualLength = 0;
232
size_t nCurrentSize = nStringLen; //try normal size first (should never resize?)
234
wxWCharBuffer theBuffer(nCurrentSize);
236
//Convert the string until the length() is reached, continuing the
237
//loop every time a null character is reached
238
while(szPos != szEnd)
240
wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
242
//Get the length of the current (sub)string
243
size_t nLen = MB2WC(NULL, szPos, 0);
245
//Invalid conversion?
246
if( nLen == (size_t)-1 )
249
// the first output character is set to NUL on failure
theBuffer.data()[0u] = wxT('\0');
254
//Increase the actual length (+1 for current null character)
255
nActualLength += nLen + 1;
257
//if the buffer is too small, realloc the buffer
258
if (nActualLength > (nCurrentSize+1))
260
// NOTE(review): the size is doubled once under an `if`; confirm that
// a single doubling is always enough for nActualLength
wxWCharBuffer theNewBuffer(nCurrentSize << 1);
261
memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize * sizeof(wchar_t));
262
theBuffer = theNewBuffer;
266
//Convert the current (sub)string
267
// NOTE(review): the output offset is the *input* offset
// (szPos - szStart); this matches the output position only if every
// input byte yields exactly one wchar_t -- confirm
if ( MB2WC(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
270
theBuffer.data()[0u] = wxT('\0');
274
//Increment to next (sub)string
275
//Note that we have to use strlen instead of nLen here
276
//because XX2XX gives us the size of the output buffer,
277
//which is not necessarily the length of the string
278
szPos += strlen(szPos) + 1;
281
//success - return actual length and the buffer
282
*pOutSize = nActualLength;
286
// Converts `nStringLen` wide characters of `szString`, which may contain
// embedded NUL characters, to a multibyte buffer. The input is processed one
// NUL-terminated substring at a time and the accumulated length is stored in
// `*pOutSize`, which must not be NULL.
const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *szString, size_t nStringLen, size_t* pOutSize) const
288
wxASSERT(pOutSize != NULL);
290
// one past the NUL that follows the last of the nStringLen characters
const wchar_t* szEnd = szString + nStringLen + 1;
291
const wchar_t* szPos = szString;
292
const wchar_t* szStart = szPos;
294
size_t nActualLength = 0;
295
size_t nCurrentSize = nStringLen << 2; //try * 4 first
297
wxCharBuffer theBuffer(nCurrentSize);
299
//Convert the string until the length() is reached, continuing the
300
//loop every time a null character is reached
301
while(szPos != szEnd)
303
wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
305
//Get the length of the current (sub)string
306
size_t nLen = WC2MB(NULL, szPos, 0);
308
//Invalid conversion?
309
if( nLen == (size_t)-1 )
312
// NOTE(review): wxT('\0') is stored into a char buffer here
theBuffer.data()[0u] = wxT('\0');
316
//Increase the actual length (+1 for current null character)
317
nActualLength += nLen + 1;
319
//if the buffer is too small, realloc the buffer
320
if (nActualLength > (nCurrentSize+1))
322
// NOTE(review): the size is doubled once under an `if`; confirm that
// a single doubling is always enough for nActualLength
wxCharBuffer theNewBuffer(nCurrentSize << 1);
323
memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize);
324
theBuffer = theNewBuffer;
328
//Convert the current (sub)string
329
// NOTE(review): the output offset is the *input* offset
// (szPos - szStart), counted in wide characters, although the output is
// measured in bytes -- confirm this is intended
if(WC2MB(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
332
theBuffer.data()[0u] = wxT('\0');
336
//Increment to next (sub)string
337
//Note that we have to use wxWcslen instead of nLen here
338
//because XX2XX gives us the size of the output buffer,
339
//which is not necessarily the length of the string
340
szPos += wxWcslen(szPos) + 1;
343
//success - return actual length and the buffer
344
*pOutSize = nActualLength;
348
// ----------------------------------------------------------------------------
350
// ----------------------------------------------------------------------------
352
// Multibyte to wide char conversion: forwards all three arguments unchanged
// to wxMB2WC() and returns its result.
size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
354
return wxMB2WC(buf, psz, n);
357
// Wide char to multibyte conversion: forwards all three arguments unchanged
// to wxWC2MB() and returns its result.
size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
359
return wxWC2MB(buf, psz, n);
364
// ----------------------------------------------------------------------------
365
// wxConvBrokenFileNames
366
// ----------------------------------------------------------------------------
368
// Selects the converter stored in m_conv from the charset name: a NULL name,
// "UTF-8" or "UTF8" (compared with wxStricmp) gives a wxMBConvUTF8 created
// with MAP_INVALID_UTF8_TO_OCTAL; the other visible assignment creates a
// wxCSConv for `charset`.
wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
370
if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
371
|| wxStricmp(charset, _T("UTF8")) == 0 )
372
m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
374
m_conv = new wxCSConv(charset);
378
// Multibyte to wide char conversion: delegates to the converter held in
// m_conv and returns its result.
wxConvBrokenFileNames::MB2WC(wchar_t *outputBuf,
380
size_t outputSize) const
382
return m_conv->MB2WC( outputBuf, psz, outputSize );
386
// Wide char to multibyte conversion: delegates to the converter held in
// m_conv and returns its result.
wxConvBrokenFileNames::WC2MB(char *outputBuf,
388
size_t outputSize) const
390
return m_conv->WC2MB( outputBuf, psz, outputSize );
395
// ----------------------------------------------------------------------------
397
// ----------------------------------------------------------------------------
399
// Implementation (C) 2004 Fredrik Roubert
402
// BASE64 decoding table
404
// Indexed by byte value (wxMBConvUTF7::MB2WC indexes it with an unsigned
// char). An entry is the 6-bit value of that character ('A'-'Z' -> 0x00-0x19,
// 'a'-'z' -> 0x1a-0x33, '0'-'9' -> 0x34-0x3d, '+' -> 0x3e, '/' -> 0x3f) or
// 0xff, which ends the decoding loop in MB2WC.
static const unsigned char utf7unb64[] =
406
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
407
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
408
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
409
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
410
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
411
0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
412
0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
413
0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
414
0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
415
0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
416
0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
417
0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
418
0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
419
0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
420
0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
421
0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
422
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
423
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
424
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
425
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
426
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
427
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
428
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
429
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
430
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
431
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
432
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
433
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
434
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
435
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
436
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
437
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
440
// Decodes the NUL-terminated UTF-7 string `psz` into wide characters.
// The loop conditions treat a NULL `buf` as "no output limit", otherwise
// output stops once `len` reaches `n`.
size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
444
while (*psz && ((!buf) || (len < n)))
446
unsigned char cc = *psz++;
454
else if (*psz == '-')
464
// BASE64 encoded string
468
// the loop runs for as long as the next input byte maps to a 6-bit value
// in utf7unb64 (0xff marks a byte that is not part of the encoding)
for (lsb = false, d = 0, l = 0;
469
(cc = utf7unb64[(unsigned char)*psz]) != 0xff; psz++)
473
// six more bits are available; extract one byte for every 8 accumulated
// bits, toggling `lsb` after each extracted byte
for (l += 6; l >= 8; lsb = !lsb)
475
c = (unsigned char)((d >> (l -= 8)) % 256);
484
// the extracted byte becomes the upper 8 bits of the output unit
*buf = (wchar_t)(c << 8);
491
if (buf && (len < n))
497
// BASE64 encoding table
499
// Maps a 6-bit value (0..63) to its output character; wxMBConvUTF7::WC2MB
// indexes it with values reduced `% 64`.
static const unsigned char utf7enb64[] =
501
'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
502
'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
503
'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
504
'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
505
'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
506
'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
507
'w', 'x', 'y', 'z', '0', '1', '2', '3',
508
'4', '5', '6', '7', '8', '9', '+', '/'
512
// UTF-7 encoding table
514
// 0 - Set D (directly encoded characters)
515
// 1 - Set O (optional direct characters)
516
// 2 - whitespace characters (optional)
517
// 3 - special characters
519
// One class code (0..3) per 7-bit character value, 16 entries per row.
// wxMBConvUTF7::WC2MB only tests for class 0 (`utf7encode[cc] < 1`).
static const unsigned char utf7encode[128] =
521
3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
522
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
523
2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
524
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
525
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
526
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
527
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
528
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
531
// Encodes the NUL-terminated wide string `psz` as UTF-7.
// The loop conditions treat a NULL `buf` as "no output limit", otherwise
// output stops once `len` reaches `n`.
size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
537
while (*psz && ((!buf) || (len < n)))
540
// 7-bit characters of class 0 in utf7encode get a branch of their own
if (cc < 0x80 && utf7encode[cc] < 1)
548
// values that do not fit in 16 bits
else if (((wxUint32)cc) > 0xffff)
550
// no surrogate pair generation (yet?)
561
// BASE64 encode string
562
unsigned int lsb, d, l;
563
for (d = 0, l = 0;; psz++)
565
// each character is fed in as two bytes, high byte (lsb == 0) first
for (lsb = 0; lsb < 2; lsb ++)
568
d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
570
// eight more bits are available; emit one output character for every
// 6 accumulated bits
for (l += 8; l >= 6; )
574
*buf++ = utf7enb64[(d >> l) % 64];
579
// end of string or a class-0 character ends the encoded run
if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
585
// the leftover bits are shifted up to fill a final 6-bit group
*buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
594
if (buf && (len < n))
599
// ----------------------------------------------------------------------------
601
// ----------------------------------------------------------------------------
603
// Increasing upper bounds used by the UTF-8 code: wxMBConvUTF8::WC2MB
// advances an index while the character exceeds utf8_max[index], and
// wxMBConvUTF8::MB2WC compares a decoded value against one entry.
static wxUint32 utf8_max[]=
604
{ 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
606
// boundaries of the private use area we use to (temporarily) remap
607
// characters invalid in a UTF-8 encoded string
608
const wxUint32 wxUnicodePUA = 0x100000;
609
// exclusive upper bound: one value for each of the 256 possible bytes
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
611
// Decodes the NUL-terminated UTF-8 string `psz` into wide characters.
// m_options selects what happens to invalid input: the visible branches
// handle MAP_INVALID_UTF8_TO_PUA, MAP_INVALID_UTF8_TO_OCTAL and neither.
// The loop conditions treat a NULL `buf` as "no output limit", otherwise
// output stops once `len` reaches `n`.
size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
615
while (*psz && ((!buf) || (len < n)))
617
// remember where this sequence started so that its raw bytes can be
// re-read by the invalid-sequence handling further down
const char *opsz = psz;
618
bool invalid = false;
619
unsigned char cc = *psz++, fc = cc;
621
// cnt counts iterations while the top bit of fc is set
for (cnt = 0; fc & 0x80; cnt++)
630
// escape the escape character for octal escapes
631
if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
632
&& cc == '\\' && (!buf || len < n))
644
// invalid UTF-8 sequence
649
unsigned ocnt = cnt - 1;
650
// keep only the payload bits of the first byte
wxUint32 res = cc & (0x3f >> cnt);
654
// a byte whose top two bits are not 10
if ((cc & 0xC0) != 0x80)
656
// invalid UTF-8 sequence
661
// append the six payload bits of this byte
res = (res << 6) | (cc & 0x3f);
663
if (invalid || res <= utf8_max[ocnt])
665
// illegal UTF-8 encoding
668
else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
669
res >= wxUnicodePUA && res < wxUnicodePUAEnd)
671
// if one of our PUA characters turns up externally
672
// it must also be treated as an illegal sequence
673
// (a bit like you have to escape an escape character)
679
// cast is ok because wchar_t == wxUint16 if WC_UTF16
680
size_t pa = encode_utf16(res, (wxUint16 *)buf);
681
if (pa == (size_t)-1)
695
#endif // WC_UTF16/!WC_UTF16
700
if (m_options & MAP_INVALID_UTF8_TO_PUA)
702
// every raw byte of the offending sequence is mapped to
// wxUnicodePUA + byte value
while (opsz < psz && (!buf || len < n))
705
// cast is ok because wchar_t == wxUint16 if WC_UTF16
706
size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
707
wxASSERT(pa != (size_t)-1);
714
*buf++ = wxUnicodePUA + (unsigned char)*opsz;
720
else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
722
while (opsz < psz && (!buf || len < n))
724
if ( buf && len + 3 < n )
726
// NOTE(review): this local `n` shadows the function parameter `n`
unsigned char n = *opsz;
728
// three octal digits of the byte, most significant first
*buf++ = (wchar_t)( L'0' + n / 0100 );
729
*buf++ = (wchar_t)( L'0' + (n % 0100) / 010 );
730
*buf++ = (wchar_t)( L'0' + n % 010 );
736
else // MAP_INVALID_UTF8_NOT
743
if (buf && (len < n))
748
// Returns true if `wch` is one of the wide characters L'0' through L'7'.
static inline bool isoctal(wchar_t wch)
750
return L'0' <= wch && wch <= L'7';
753
// Encodes the NUL-terminated wide string `psz` as UTF-8, reversing the
// PUA and octal mappings when the matching bits are set in m_options.
// The loop conditions treat a NULL `buf` as "no output limit", otherwise
// output stops once `len` reaches `n`.
size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
757
while (*psz && ((!buf) || (len < n)))
761
// cast is ok for WC_UTF16
762
size_t pa = decode_utf16((const wxUint16 *)psz, cc);
763
// on a decoding failure advance by a single unit
psz += (pa == (size_t)-1) ? 1 : pa;
765
cc=(*psz++) & 0x7fffffff;
768
if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
769
&& cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
772
// a value in [wxUnicodePUA, wxUnicodePUAEnd) is written back as the
// single byte cc - wxUnicodePUA
*buf++ = (char)(cc - wxUnicodePUA);
775
// doubled backslash
else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
776
&& cc == L'\\' && psz[0] == L'\\' )
783
// three octal digits follow
else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
785
isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
789
*buf++ = (char) ((psz[0] - L'0')*0100 +
790
(psz[1] - L'0')*010 +
800
// find the first utf8_max entry that cc does not exceed
for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
814
// NOTE(review): -128 >> cnt depends on how the compiler right-shifts a
// negative value
*buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
816
// byte of the form 10xxxxxx carrying six bits of cc
*buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
828
// ----------------------------------------------------------------------------
830
// ----------------------------------------------------------------------------
832
#ifdef WORDS_BIGENDIAN
833
#define wxMBConvUTF16straight wxMBConvUTF16BE
834
#define wxMBConvUTF16swap wxMBConvUTF16LE
836
#define wxMBConvUTF16swap wxMBConvUTF16BE
837
#define wxMBConvUTF16straight wxMBConvUTF16LE
843
// copy 16bit MB to 16bit String
844
// Reads `psz` as a sequence of 16-bit units ending with a zero unit and
// copies each unit unchanged to `buf`. A NULL `buf` removes the output limit
// from the loop condition; otherwise the loop stops once `len` reaches `n`.
size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
848
while (*(wxUint16*)psz && (!buf || len < n))
851
*buf++ = *(wxUint16*)psz;
854
psz += sizeof(wxUint16);
856
// terminate the output if there is still room for it
if (buf && len<n) *buf=0;
862
// copy 16bit String to 16bit MB
863
// Writes each wide character of the NUL-terminated `psz` to `buf` as one
// 16-bit unit; `len` counts bytes (sizeof(wxUint16) per character).
size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
867
while (*psz && (!buf || len < n))
871
*(wxUint16*)buf = *psz;
872
buf += sizeof(wxUint16);
874
len += sizeof(wxUint16);
877
// write a 16-bit terminator if it still fits
// NOTE(review): n is unsigned, so n-sizeof(wxUint16) wraps around when
// n < sizeof(wxUint16) and the test then passes -- confirm callers never
// pass such an n together with a non-NULL buf
if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
883
// swap 16bit MB to 16bit String
884
// Reads `psz` as a sequence of 16-bit units ending with a zero unit and
// stores each unit in `buf` with its two bytes exchanged.
size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
888
while (*(wxUint16*)psz && (!buf || len < n))
892
// exchange the two bytes of the unit
((char *)buf)[0] = psz[1];
893
((char *)buf)[1] = psz[0];
897
psz += sizeof(wxUint16);
899
// terminate the output if there is still room for it
if (buf && len<n) *buf=0;
905
// swap 16bit String to 16bit MB
906
// Writes the wide characters of the NUL-terminated `psz` to `buf` as 16-bit
// units with their two bytes exchanged; `len` counts bytes.
size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
910
while (*psz && (!buf || len < n))
914
// second byte of the character first, then the first one
*buf++ = ((char*)psz)[1];
915
*buf++ = ((char*)psz)[0];
917
len += sizeof(wxUint16);
920
// write a 16-bit terminator if it still fits
// NOTE(review): n is unsigned, so n-sizeof(wxUint16) wraps around when
// n < sizeof(wxUint16) and the test then passes -- confirm callers never
// pass such an n together with a non-NULL buf
if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
929
// copy 16bit MB to 32bit String
930
// Reads `psz` as 16-bit units ending with a zero unit and decodes them with
// decode_utf16(), which yields the decoded value in `cc` and the number of
// units consumed in `pa`.
size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
934
while (*(wxUint16*)psz && (!buf || len < n))
937
size_t pa=decode_utf16((wxUint16*)psz, cc);
938
// (size_t)-1 is the failure value of decode_utf16()
if (pa == (size_t)-1)
944
// advance by the number of 16-bit units that were decoded
psz += pa * sizeof(wxUint16);
946
// terminate the output if there is still room for it
if (buf && len<n) *buf=0;
952
// copy 32bit String to 16bit MB
953
// Encodes each wide character of the NUL-terminated `psz` with
// encode_utf16() into `cc` and writes the resulting 16-bit unit(s) to `buf`;
// `len` counts bytes.
size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
957
while (*psz && (!buf || len < n))
960
size_t pa=encode_utf16(*psz, cc);
962
// (size_t)-1 is the failure value of encode_utf16()
if (pa == (size_t)-1)
967
*(wxUint16*)buf = cc[0];
968
buf += sizeof(wxUint16);
971
// second unit of the encoded character
*(wxUint16*)buf = cc[1];
972
buf += sizeof(wxUint16);
976
// pa is the number of 16-bit units produced for this character
len += pa*sizeof(wxUint16);
979
// write a 16-bit terminator if it still fits
// NOTE(review): n is unsigned, so n-sizeof(wxUint16) wraps around when
// n < sizeof(wxUint16) and the test then passes -- confirm callers never
// pass such an n together with a non-NULL buf
if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
985
// swap 16bit MB to 32bit String
986
// Reads `psz` as 16-bit units ending with a zero unit, exchanges the bytes
// of two consecutive units into `tmp` and decodes them with decode_utf16().
size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
990
while (*(wxUint16*)psz && (!buf || len < n))
994
// byte-swapped copies of the current unit and of the one after it
tmp[0]=psz[1]; tmp[1]=psz[0];
995
tmp[2]=psz[3]; tmp[3]=psz[2];
997
size_t pa=decode_utf16((wxUint16*)tmp, cc);
998
// (size_t)-1 is the failure value of decode_utf16()
if (pa == (size_t)-1)
1005
// advance by the number of 16-bit units that were decoded
psz += pa * sizeof(wxUint16);
1007
// terminate the output if there is still room for it
if (buf && len<n) *buf=0;
1013
// swap 32bit String to 16bit MB
1014
// Encodes each wide character of the NUL-terminated `psz` with
// encode_utf16() into `cc` and writes the resulting unit(s) to `buf` with
// the two bytes of each unit exchanged; `len` counts bytes.
size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1018
while (*psz && (!buf || len < n))
1021
size_t pa=encode_utf16(*psz, cc);
1023
// (size_t)-1 is the failure value of encode_utf16()
if (pa == (size_t)-1)
1028
// bytes 1,0 of cc: the first unit, swapped
*buf++ = ((char*)cc)[1];
1029
*buf++ = ((char*)cc)[0];
1032
// bytes 3,2 of cc: the second unit, swapped
*buf++ = ((char*)cc)[3];
1033
*buf++ = ((char*)cc)[2];
1037
// pa is the number of 16-bit units produced for this character
len += pa*sizeof(wxUint16);
1040
// write a 16-bit terminator if it still fits
// NOTE(review): n is unsigned, so n-sizeof(wxUint16) wraps around when
// n < sizeof(wxUint16) and the test then passes -- confirm callers never
// pass such an n together with a non-NULL buf
if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
1048
// ----------------------------------------------------------------------------
1050
// ----------------------------------------------------------------------------
1052
#ifdef WORDS_BIGENDIAN
1053
#define wxMBConvUTF32straight wxMBConvUTF32BE
1054
#define wxMBConvUTF32swap wxMBConvUTF32LE
1056
#define wxMBConvUTF32swap wxMBConvUTF32BE
1057
#define wxMBConvUTF32straight wxMBConvUTF32LE
1061
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1062
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1067
// copy 32bit MB to 16bit String
1068
// Reads `psz` as 32-bit units ending with a zero unit and encodes each one
// as UTF-16 via encode_utf16().
size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1072
while (*(wxUint32*)psz && (!buf || len < n))
1076
size_t pa=encode_utf16(*(wxUint32*)psz, cc);
1077
// (size_t)-1 is the failure value of encode_utf16()
if (pa == (size_t)-1)
1087
psz += sizeof(wxUint32);
1089
// terminate the output if there is still room for it
if (buf && len<n) *buf=0;
1095
// copy 16bit String to 32bit MB
1096
// Decodes the NUL-terminated wide string `psz` as UTF-16 with
// decode_utf16() and writes each decoded value to `buf` as one 32-bit unit;
// `len` counts bytes.
size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1100
while (*psz && (!buf || len < n))
1104
// cast is ok for WC_UTF16
1105
size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1106
// (size_t)-1 is the failure value of decode_utf16()
if (pa == (size_t)-1)
1111
*(wxUint32*)buf = cc;
1112
buf += sizeof(wxUint32);
1114
len += sizeof(wxUint32);
1118
// NOTE(review): n is unsigned, so n-sizeof(wxUint32) wraps around when
// n < sizeof(wxUint32) and the test then passes -- confirm callers never
// pass such an n together with a non-NULL buf
if (buf && len<=n-sizeof(wxUint32))
1126
// swap 32bit MB to 16bit String
1127
// Reads `psz` as 32-bit units ending with a zero unit, reverses the four
// bytes of each unit into `tmp` and encodes the result as UTF-16 via
// encode_utf16().
size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1131
while (*(wxUint32*)psz && (!buf || len < n))
1134
// reverse the byte order of the current 32-bit unit
tmp[0] = psz[3]; tmp[1] = psz[2];
1135
tmp[2] = psz[1]; tmp[3] = psz[0];
1140
size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1141
// (size_t)-1 is the failure value of encode_utf16()
if (pa == (size_t)-1)
1151
psz += sizeof(wxUint32);
1161
// swap 16bit String to 32bit MB
1162
// Decodes the NUL-terminated wide string `psz` as UTF-16 with
// decode_utf16(), receiving each decoded value in the storage `cc` viewed as
// a wxUint32; `len` counts bytes (sizeof(wxUint32) per decoded value).
size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1166
while (*psz && (!buf || len < n))
1170
// cast is ok for WC_UTF16
1171
size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1172
// (size_t)-1 is the failure value of decode_utf16()
if (pa == (size_t)-1)
1182
len += sizeof(wxUint32);
1186
// NOTE(review): n is unsigned, so n-sizeof(wxUint32) wraps around when
// n < sizeof(wxUint32) and the test then passes -- confirm callers never
// pass such an n together with a non-NULL buf
if (buf && len<=n-sizeof(wxUint32))
1195
// copy 32bit MB to 32bit String
1196
// Reads `psz` as 32-bit units ending with a zero unit and copies each unit
// unchanged to `buf`. A NULL `buf` removes the output limit from the loop
// condition; otherwise the loop stops once `len` reaches `n`.
size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1200
while (*(wxUint32*)psz && (!buf || len < n))
1203
*buf++ = *(wxUint32*)psz;
1205
psz += sizeof(wxUint32);
1215
// copy 32bit String to 32bit MB
1216
// Writes each wide character of the NUL-terminated `psz` to `buf` as one
// 32-bit unit; `len` counts bytes (sizeof(wxUint32) per character).
size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1220
while (*psz && (!buf || len < n))
1224
*(wxUint32*)buf = *psz;
1225
buf += sizeof(wxUint32);
1228
len += sizeof(wxUint32);
1232
// NOTE(review): n is unsigned, so n-sizeof(wxUint32) wraps around when
// n < sizeof(wxUint32) and the test then passes -- confirm callers never
// pass such an n together with a non-NULL buf
if (buf && len<=n-sizeof(wxUint32))
1239
// swap 32bit MB to 32bit String
1240
// Reads `psz` as 32-bit units ending with a zero unit and stores each unit
// in `buf` with its four bytes in reverse order.
size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1244
while (*(wxUint32*)psz && (!buf || len < n))
1248
// reverse the byte order of the unit
((char *)buf)[0] = psz[3];
1249
((char *)buf)[1] = psz[2];
1250
((char *)buf)[2] = psz[1];
1251
((char *)buf)[3] = psz[0];
1255
psz += sizeof(wxUint32);
1265
// swap 32bit String to 32bit MB
1266
// Writes each wide character of the NUL-terminated `psz` to `buf` as four
// bytes in reverse order; `len` counts bytes.
size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1270
while (*psz && (!buf || len < n))
1274
// last byte of the character first
*buf++ = ((char *)psz)[3];
1275
*buf++ = ((char *)psz)[2];
1276
*buf++ = ((char *)psz)[1];
1277
*buf++ = ((char *)psz)[0];
1279
len += sizeof(wxUint32);
1283
// NOTE(review): n is unsigned, so n-sizeof(wxUint32) wraps around when
// n < sizeof(wxUint32) and the test then passes -- confirm callers never
// pass such an n together with a non-NULL buf
if (buf && len<=n-sizeof(wxUint32))
1293
// ============================================================================
1294
// The classes doing conversion using the iconv_xxx() functions
1295
// ============================================================================
1299
// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1300
// E2BIG if output buffer is _exactly_ as big as needed. Such case is
1301
// (unless there's yet another bug in glibc) the only case when iconv()
1302
// returns with (size_t)-1 (which means error) and says there are 0 bytes
1303
// left in the input buffer -- when _real_ error occurs,
1304
// bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1306
// [This bug does not appear in glibc 2.2.]
1307
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1308
#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1309
(errno != E2BIG || bufLeft != 0))
1311
#define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1314
#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1316
// ----------------------------------------------------------------------------
1317
// wxMBConv_iconv: encapsulates an iconv character set
1318
// ----------------------------------------------------------------------------
1320
// wxMBConv implementation that converts through two iconv descriptors,
// m2w and w2m, opened for the charset name given to the constructor.
class wxMBConv_iconv : public wxMBConv
1323
// opens the iconv descriptors for the charset called `name`
wxMBConv_iconv(const wxChar *name);
1324
virtual ~wxMBConv_iconv();
1326
virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1327
virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1330
// true only if both descriptors differ from (iconv_t)-1
{ return (m2w != (iconv_t)-1) && (w2m != (iconv_t)-1); }
1333
// the iconv handlers used to translate from multibyte to wide char and in
1334
// the other direction
1338
// guards access to m2w and w2m objects
1339
wxMutex m_iconvMutex;
1343
// the name (for iconv_open()) of a wide char charset -- if none is
1344
// available on this machine, it will remain NULL
1345
static const char *ms_wcCharsetName;
1347
// true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1348
// different endian-ness than the native one
1349
static bool ms_wcNeedsSwap;
1352
// make the constructor available for unit testing
1353
// Creates a wxMBConv_iconv for the charset `name` on the heap and checks
// the new object with IsOk().
WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1355
wxMBConv_iconv* result = new wxMBConv_iconv( name );
1356
// the converter could not be set up for this charset
if ( !result->IsOk() )
1364
// Definitions of the static members of wxMBConv_iconv. A NULL name makes the
// constructor probe for a usable wchar_t charset.
const char *wxMBConv_iconv::ms_wcCharsetName = NULL;
1365
bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1367
// Opens the iconv descriptors for the charset `name`. While the static
// ms_wcCharsetName is still NULL, the constructor also probes which name
// iconv accepts for the wchar_t encoding (WC_NAME_BEST, then WC_NAME, then
// "WCHAR_T") and whether converted output needs byte swapping.
wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1369
// iconv operates with chars, not wxChars, but luckily it uses only ASCII
1370
// names for the charsets
1371
const wxCharBuffer cname(wxString(name).ToAscii());
1373
// check for charset that represents wchar_t:
1374
if (ms_wcCharsetName == NULL)
1376
ms_wcNeedsSwap = false;
1378
// try charset with explicit bytesex info (e.g. "UCS-4LE"):
1379
ms_wcCharsetName = WC_NAME_BEST;
1380
m2w = iconv_open(ms_wcCharsetName, cname);
1382
if (m2w == (iconv_t)-1)
1384
// try charset w/o bytesex info (e.g. "UCS4")
1385
// and check for bytesex ourselves:
1386
ms_wcCharsetName = WC_NAME;
1387
m2w = iconv_open(ms_wcCharsetName, cname);
1389
// last bet, try if it knows WCHAR_T pseudo-charset
1390
if (m2w == (iconv_t)-1)
1392
ms_wcCharsetName = "WCHAR_T";
1393
m2w = iconv_open(ms_wcCharsetName, cname);
1396
if (m2w != (iconv_t)-1)
1398
// small buffers for a trial conversion
char buf[2], *bufPtr;
1399
wchar_t wbuf[2], *wbufPtr;
1407
outsz = SIZEOF_WCHAR_T * 2;
1411
res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1412
(char**)&wbufPtr, &outsz);
1414
if (ICONV_FAILED(res, insz))
1416
ms_wcCharsetName = NULL;
1417
wxLogLastError(wxT("iconv"));
1418
wxLogError(_("Conversion to charset '%s' doesn't work."), name);
1422
// a swap is needed if the converted character does not compare equal to
// the input character
ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1427
ms_wcCharsetName = NULL;
1429
// VS: we must not output an error here, since wxWidgets will safely
1430
// fall back to using wxEncodingConverter.
1431
wxLogTrace(TRACE_STRCONV, wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name);
1434
wxLogTrace(TRACE_STRCONV,
1435
wxT("wchar_t charset is '%s', needs swap: %i"),
1436
ms_wcCharsetName ? ms_wcCharsetName : "<none>", ms_wcNeedsSwap);
1438
else // we already have ms_wcCharsetName
1440
m2w = iconv_open(ms_wcCharsetName, cname);
1443
// NB: don't ever pass NULL to iconv_open(), it may crash!
1444
if ( ms_wcCharsetName )
1446
// descriptor for the opposite direction: the charset arguments are
// exchanged compared to the m2w calls
w2m = iconv_open( cname, ms_wcCharsetName);
1454
// Destructor: each descriptor is handled only if it was opened, i.e. if it
// differs from (iconv_t)-1.
wxMBConv_iconv::~wxMBConv_iconv()
1456
if ( m2w != (iconv_t)-1 )
1458
if ( w2m != (iconv_t)-1 )
1462
// Converts the NUL-terminated multibyte string `psz` to wide characters
// with iconv(). Two cases are visible: converting into `buf` (room for `n`
// wide characters), and measuring the required size with a temporary buffer
// when there is no destination.
size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1465
// NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
1466
// Unfortunately there are a couple of global wxCSConv objects such as
1467
// wxConvLocal that are used all over wx code, so we have to make sure
1468
// the handle is used by at most one thread at a time. Otherwise
1469
// only a few wx classes would be safe to use from non-main threads
1470
// as MB<->WC conversion would fail "randomly".
1471
// the method is const, so the constness is cast away to lock the mutex
wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1474
// input size in bytes, without the trailing NUL
size_t inbuf = strlen(psz);
1475
// output capacity in bytes
size_t outbuf = n * SIZEOF_WCHAR_T;
1477
// VS: Use these instead of psz, buf because iconv() modifies its arguments:
1478
wchar_t *bufPtr = buf;
1479
const char *pszPtr = psz;
1483
// have destination buffer, convert there
1485
ICONV_CHAR_CAST(&pszPtr), &inbuf,
1486
(char**)&bufPtr, &outbuf);
1487
// wide characters written = capacity minus what iconv() left unused
res = n - (outbuf / SIZEOF_WCHAR_T);
1491
// convert to native endianness
1492
WC_BSWAP(buf /* _not_ bufPtr */, res)
1495
// NB: iconv was given only strlen(psz) characters on input, and so
1496
// it couldn't convert the trailing zero. Let's do it ourselves
1497
// if there's some room left for it in the output buffer.
1503
// no destination buffer... convert using temp buffer
1504
// to calculate destination buffer requirement
1509
// the temporary buffer provides room for 8 wide characters per pass
outbuf = 8*SIZEOF_WCHAR_T;
1512
ICONV_CHAR_CAST(&pszPtr), &inbuf,
1513
(char**)&bufPtr, &outbuf );
1515
// add the number of wide characters produced by this pass
res += 8-(outbuf/SIZEOF_WCHAR_T);
1516
// E2BIG: the temporary buffer filled up, so run another pass
} while ((cres==(size_t)-1) && (errno==E2BIG));
1519
if (ICONV_FAILED(cres, inbuf))
1521
//VS: it is ok if iconv fails, hence trace only
1522
wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1529
// Converts the NUL-terminated wide string `psz` to multibyte characters with
// iconv(). Two cases are visible: converting into `buf`, and measuring the
// required size with a 16-byte temporary buffer when there is no
// destination. A byte-swapped temporary copy of the input is made first when
// the endianness has to be switched.
size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1532
// NB: explained in MB2WC
1533
wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1536
// input size in BYTES, without the trailing NUL
size_t inbuf = wxWcslen(psz) * SIZEOF_WCHAR_T;
1540
wchar_t *tmpbuf = 0;
1544
// need to copy to temp buffer to switch endianness
1545
// this absolutely doesn't rock!
1546
// (no, doing WC_BSWAP twice on the original buffer won't help, as it
1547
// could be in read-only memory, or be accessed in some other thread)
1548
// NOTE(review): inbuf already is a byte count (see its initialisation),
// yet the three statements below use it as a character count: malloc and
// memcpy multiply it by SIZEOF_WCHAR_T again, so memcpy reads beyond the
// end of psz, and WC_BSWAP swaps inbuf elements -- confirm and fix
tmpbuf=(wchar_t*)malloc((inbuf+1)*SIZEOF_WCHAR_T);
1549
memcpy(tmpbuf,psz,(inbuf+1)*SIZEOF_WCHAR_T);
1550
WC_BSWAP(tmpbuf, inbuf)
1556
// have destination buffer, convert there
1557
cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1561
// NB: iconv was given only wcslen(psz) characters on input, and so
1562
// it couldn't convert the trailing zero. Let's do it ourselves
1563
// if there's some room left for it in the output buffer.
1569
// no destination buffer... convert using temp buffer
1570
// to calculate destination buffer requirement
1574
// restart at the beginning of the 16-byte temporary buffer for each pass
buf = tbuf; outbuf = 16;
1576
cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1579
// E2BIG: the temporary buffer filled up, so run another pass
} while ((cres==(size_t)-1) && (errno==E2BIG));
1587
if (ICONV_FAILED(cres, inbuf))
1589
//VS: it is ok if iconv fails, hence trace only
1590
wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1597
#endif // HAVE_ICONV
1600
// ============================================================================
1601
// Win32 conversion classes
1602
// ============================================================================
1604
#ifdef wxHAVE_WIN32_MB2WC
1608
extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1609
extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1612
class wxMBConv_win32 : public wxMBConv
1617
m_CodePage = CP_ACP;
1621
wxMBConv_win32(const wxChar* name)
1623
m_CodePage = wxCharsetToCodepage(name);
1626
wxMBConv_win32(wxFontEncoding encoding)
1628
m_CodePage = wxEncodingToCodepage(encoding);
1632
size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1634
// note that we have to use the MB_ERR_INVALID_CHARS flag as without it
1635
// the behaviour is not compatible with the Unix version (using iconv)
1636
// and breaks the library itself, e.g. wxTextInputStream::NextChar()
1637
// wouldn't work if reading an incomplete MB char didn't result in an
1640
// note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1641
// an error (tested under Windows Server 2003) and apparently it is
1642
// done on purpose, i.e. the function accepts any input in this case
1643
// and although I'd prefer to return error on ill-formed output, our
1644
// own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1645
// explicitly ill-formed according to RFC 2152) neither so we don't
1646
// even have any fallback here...
1647
int flags = m_CodePage == CP_UTF7 ? 0 : MB_ERR_INVALID_CHARS;
1649
const size_t len = ::MultiByteToWideChar
1651
m_CodePage, // code page
1652
flags, // flags: fall on error
1653
psz, // input string
1654
-1, // its length (NUL-terminated)
1655
buf, // output string
1656
buf ? n : 0 // size of output buffer
1659
// note that it returns count of written chars for buf != NULL and size
1660
// of the needed buffer for buf == NULL so in either case the length of
1661
// the string (which never includes the terminating NUL) is one less
1662
return len ? len - 1 : (size_t)-1;
1665
size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1668
we have a problem here: by default, WideCharToMultiByte() may
1669
replace characters unrepresentable in the target code page with bad
1670
quality approximations such as turning "1/2" symbol (U+00BD) into
1671
"1" for the code pages which don't have it and we, obviously, want
1672
to avoid this at any price
1674
the trouble is that this function does it _silently_, i.e. it won't
1675
even tell us whether it did or not... Win98/2000 and higher provide
1676
WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1677
we have to resort to a round trip, i.e. check that converting back
1678
results in the same string -- this is, of course, expensive but
1679
otherwise we simply can't be sure to not garble the data.
1682
// determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1683
// it doesn't work with CJK encodings (which we test for rather roughly
1684
// here...) nor with UTF-7/8 nor, of course, with Windows versions not
1686
BOOL usedDef wxDUMMY_INITIALIZE(false);
1689
if ( CanUseNoBestFit() && m_CodePage < 50000 )
1691
// it's our lucky day
1692
flags = WC_NO_BEST_FIT_CHARS;
1693
pUsedDef = &usedDef;
1695
else // old system or unsupported encoding
1701
const size_t len = ::WideCharToMultiByte
1703
m_CodePage, // code page
1704
flags, // either none or no best fit
1705
pwz, // input string
1706
-1, // it is (wide) NUL-terminated
1707
buf, // output buffer
1708
buf ? n : 0, // and its size
1709
NULL, // default "replacement" char
1710
pUsedDef // [out] was it used?
1715
// function totally failed
1719
// if we were really converting, check if we succeeded
1724
// check if the conversion failed, i.e. if any replacements
1729
else // we must resort to double tripping...
1731
wxWCharBuffer wcBuf(n);
1732
if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1733
wcscmp(wcBuf, pwz) != 0 )
1735
// we didn't obtain the same thing we started from, hence
1736
// the conversion was lossy and we consider that it failed
1742
// see the comment above for the reason of "len - 1"
1746
bool IsOk() const { return m_CodePage != -1; }
1749
static bool CanUseNoBestFit()
1751
static int s_isWin98Or2k = -1;
1753
if ( s_isWin98Or2k == -1 )
1756
switch ( wxGetOsVersion(&verMaj, &verMin) )
1759
s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
1763
s_isWin98Or2k = verMaj >= 5;
1767
// unknown, be conseravtive by default
1771
wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
1774
return s_isWin98Or2k == 1;
1780
#endif // wxHAVE_WIN32_MB2WC
1782
// ============================================================================
1783
// Cocoa conversion classes
1784
// ============================================================================
1786
#if defined(__WXCOCOA__)
1788
// RN: There is no UTF-32 support in either Core Foundation or
1789
// Cocoa. Strangely enough, Core Foundation uses UTF-32
1790
// internally quite a bit - it's just not public (yet).
1792
#include <CoreFoundation/CFString.h>
1793
#include <CoreFoundation/CFStringEncodingExt.h>
1795
// Translates a wxFontEncoding value into the corresponding Core Foundation
// CFStringEncoding constant.
//
// The result starts out as kCFStringEncodingInvalidId.
// wxFONTENCODING_DEFAULT yields CFStringGetSystemEncoding(); every other
// supported value is handled by one case of the switch below.
// NOTE(review): encodings without a case presumably keep the invalid id --
// confirm against the default branch.
CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
1797
CFStringEncoding enc = kCFStringEncodingInvalidId ;
1798
if ( encoding == wxFONTENCODING_DEFAULT )
1800
enc = CFStringGetSystemEncoding();
1802
else switch( encoding)
1804
// ISO 8859 family
case wxFONTENCODING_ISO8859_1 :
1805
enc = kCFStringEncodingISOLatin1 ;
1807
case wxFONTENCODING_ISO8859_2 :
1808
enc = kCFStringEncodingISOLatin2;
1810
case wxFONTENCODING_ISO8859_3 :
1811
enc = kCFStringEncodingISOLatin3 ;
1813
case wxFONTENCODING_ISO8859_4 :
1814
enc = kCFStringEncodingISOLatin4;
1816
case wxFONTENCODING_ISO8859_5 :
1817
enc = kCFStringEncodingISOLatinCyrillic;
1819
case wxFONTENCODING_ISO8859_6 :
1820
enc = kCFStringEncodingISOLatinArabic;
1822
case wxFONTENCODING_ISO8859_7 :
1823
enc = kCFStringEncodingISOLatinGreek;
1825
case wxFONTENCODING_ISO8859_8 :
1826
enc = kCFStringEncodingISOLatinHebrew;
1828
case wxFONTENCODING_ISO8859_9 :
1829
enc = kCFStringEncodingISOLatin5;
1831
case wxFONTENCODING_ISO8859_10 :
1832
enc = kCFStringEncodingISOLatin6;
1834
case wxFONTENCODING_ISO8859_11 :
1835
enc = kCFStringEncodingISOLatinThai;
1837
case wxFONTENCODING_ISO8859_13 :
1838
enc = kCFStringEncodingISOLatin7;
1840
case wxFONTENCODING_ISO8859_14 :
1841
enc = kCFStringEncodingISOLatin8;
1843
case wxFONTENCODING_ISO8859_15 :
1844
enc = kCFStringEncodingISOLatin9;
1847
case wxFONTENCODING_KOI8 :
1848
enc = kCFStringEncodingKOI8_R;
1850
// same target as wxFONTENCODING_CP866 further down
case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
1851
enc = kCFStringEncodingDOSRussian;
1854
// case wxFONTENCODING_BULGARIAN :
1858
// DOS code pages
case wxFONTENCODING_CP437 :
1859
enc =kCFStringEncodingDOSLatinUS ;
1861
case wxFONTENCODING_CP850 :
1862
enc = kCFStringEncodingDOSLatin1;
1864
case wxFONTENCODING_CP852 :
1865
enc = kCFStringEncodingDOSLatin2;
1867
case wxFONTENCODING_CP855 :
1868
enc = kCFStringEncodingDOSCyrillic;
1870
case wxFONTENCODING_CP866 :
1871
enc =kCFStringEncodingDOSRussian ;
1873
case wxFONTENCODING_CP874 :
1874
enc = kCFStringEncodingDOSThai;
1876
case wxFONTENCODING_CP932 :
1877
enc = kCFStringEncodingDOSJapanese;
1879
case wxFONTENCODING_CP936 :
1880
enc =kCFStringEncodingDOSChineseSimplif ;
1882
case wxFONTENCODING_CP949 :
1883
enc = kCFStringEncodingDOSKorean;
1885
case wxFONTENCODING_CP950 :
1886
enc = kCFStringEncodingDOSChineseTrad;
1888
// Windows code pages
case wxFONTENCODING_CP1250 :
1889
enc = kCFStringEncodingWindowsLatin2;
1891
case wxFONTENCODING_CP1251 :
1892
enc =kCFStringEncodingWindowsCyrillic ;
1894
case wxFONTENCODING_CP1252 :
1895
enc =kCFStringEncodingWindowsLatin1 ;
1897
case wxFONTENCODING_CP1253 :
1898
enc = kCFStringEncodingWindowsGreek;
1900
case wxFONTENCODING_CP1254 :
1901
enc = kCFStringEncodingWindowsLatin5;
1903
case wxFONTENCODING_CP1255 :
1904
enc =kCFStringEncodingWindowsHebrew ;
1906
case wxFONTENCODING_CP1256 :
1907
enc =kCFStringEncodingWindowsArabic ;
1909
case wxFONTENCODING_CP1257 :
1910
enc = kCFStringEncodingWindowsBalticRim;
1912
// This only really encodes to UTF7 (if that) evidently
1913
// case wxFONTENCODING_UTF7 :
1914
// enc = kCFStringEncodingNonLossyASCII ;
1916
case wxFONTENCODING_UTF8 :
1917
enc = kCFStringEncodingUTF8 ;
1919
case wxFONTENCODING_EUC_JP :
1920
enc = kCFStringEncodingEUC_JP;
1922
case wxFONTENCODING_UTF16 :
1923
enc = kCFStringEncodingUnicode ;
1925
// classic Mac OS encodings
case wxFONTENCODING_MACROMAN :
1926
enc = kCFStringEncodingMacRoman ;
1928
case wxFONTENCODING_MACJAPANESE :
1929
enc = kCFStringEncodingMacJapanese ;
1931
case wxFONTENCODING_MACCHINESETRAD :
1932
enc = kCFStringEncodingMacChineseTrad ;
1934
case wxFONTENCODING_MACKOREAN :
1935
enc = kCFStringEncodingMacKorean ;
1937
case wxFONTENCODING_MACARABIC :
1938
enc = kCFStringEncodingMacArabic ;
1940
case wxFONTENCODING_MACHEBREW :
1941
enc = kCFStringEncodingMacHebrew ;
1943
case wxFONTENCODING_MACGREEK :
1944
enc = kCFStringEncodingMacGreek ;
1946
case wxFONTENCODING_MACCYRILLIC :
1947
enc = kCFStringEncodingMacCyrillic ;
1949
case wxFONTENCODING_MACDEVANAGARI :
1950
enc = kCFStringEncodingMacDevanagari ;
1952
case wxFONTENCODING_MACGURMUKHI :
1953
enc = kCFStringEncodingMacGurmukhi ;
1955
case wxFONTENCODING_MACGUJARATI :
1956
enc = kCFStringEncodingMacGujarati ;
1958
case wxFONTENCODING_MACORIYA :
1959
enc = kCFStringEncodingMacOriya ;
1961
case wxFONTENCODING_MACBENGALI :
1962
enc = kCFStringEncodingMacBengali ;
1964
case wxFONTENCODING_MACTAMIL :
1965
enc = kCFStringEncodingMacTamil ;
1967
case wxFONTENCODING_MACTELUGU :
1968
enc = kCFStringEncodingMacTelugu ;
1970
case wxFONTENCODING_MACKANNADA :
1971
enc = kCFStringEncodingMacKannada ;
1973
case wxFONTENCODING_MACMALAJALAM :
1974
enc = kCFStringEncodingMacMalayalam ;
1976
case wxFONTENCODING_MACSINHALESE :
1977
enc = kCFStringEncodingMacSinhalese ;
1979
case wxFONTENCODING_MACBURMESE :
1980
enc = kCFStringEncodingMacBurmese ;
1982
case wxFONTENCODING_MACKHMER :
1983
enc = kCFStringEncodingMacKhmer ;
1985
case wxFONTENCODING_MACTHAI :
1986
enc = kCFStringEncodingMacThai ;
1988
case wxFONTENCODING_MACLAOTIAN :
1989
enc = kCFStringEncodingMacLaotian ;
1991
case wxFONTENCODING_MACGEORGIAN :
1992
enc = kCFStringEncodingMacGeorgian ;
1994
case wxFONTENCODING_MACARMENIAN :
1995
enc = kCFStringEncodingMacArmenian ;
1997
case wxFONTENCODING_MACCHINESESIMP :
1998
enc = kCFStringEncodingMacChineseSimp ;
2000
case wxFONTENCODING_MACTIBETAN :
2001
enc = kCFStringEncodingMacTibetan ;
2003
case wxFONTENCODING_MACMONGOLIAN :
2004
enc = kCFStringEncodingMacMongolian ;
2006
case wxFONTENCODING_MACETHIOPIC :
2007
enc = kCFStringEncodingMacEthiopic ;
2009
case wxFONTENCODING_MACCENTRALEUR :
2010
enc = kCFStringEncodingMacCentralEurRoman ;
2012
case wxFONTENCODING_MACVIATNAMESE :
2013
enc = kCFStringEncodingMacVietnamese ;
2015
case wxFONTENCODING_MACARABICEXT :
2016
enc = kCFStringEncodingMacExtArabic ;
2018
case wxFONTENCODING_MACSYMBOL :
2019
enc = kCFStringEncodingMacSymbol ;
2021
case wxFONTENCODING_MACDINGBATS :
2022
enc = kCFStringEncodingMacDingbats ;
2024
case wxFONTENCODING_MACTURKISH :
2025
enc = kCFStringEncodingMacTurkish ;
2027
case wxFONTENCODING_MACCROATIAN :
2028
enc = kCFStringEncodingMacCroatian ;
2030
case wxFONTENCODING_MACICELANDIC :
2031
enc = kCFStringEncodingMacIcelandic ;
2033
case wxFONTENCODING_MACROMANIAN :
2034
enc = kCFStringEncodingMacRomanian ;
2036
case wxFONTENCODING_MACCELTIC :
2037
enc = kCFStringEncodingMacCeltic ;
2039
case wxFONTENCODING_MACGAELIC :
2040
enc = kCFStringEncodingMacGaelic ;
2042
// case wxFONTENCODING_MACKEYBOARD :
2043
// enc = kCFStringEncodingMacKeyboardGlyphs ;
2046
// because gcc is picky
2052
// wxMBConv implementation for wxCocoa: conversions go through a temporary
// Core Foundation CFString, using the CFStringEncoding kept in m_encoding
// for the multibyte side.
class wxMBConv_cocoa : public wxMBConv
2057
// presumably the default constructor's body: use the system encoding
Init(CFStringGetSystemEncoding()) ;
2061
// construct from a charset name: the name is first mapped to a
// wxFontEncoding (non-interactively) and then to a CFStringEncoding
wxMBConv_cocoa(const wxChar* name)
2063
Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2067
// construct from a wxFontEncoding
wxMBConv_cocoa(wxFontEncoding encoding)
2069
Init( wxCFStringEncFromFontEnc(encoding) );
2076
// common part of all constructors: remember the encoding to use
void Init( CFStringEncoding encoding)
2078
m_encoding = encoding ;
2081
// Converts the multibyte string szUnConv to wide characters in szOut.
// A CFString is created from the input bytes and its UTF-16 contents are
// then extracted; when wchar_t is 4 bytes wide they go through a temporary
// UniChar buffer and wxMBConvUTF16 to be widened.
size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2085
CFStringRef theString = CFStringCreateWithBytes (
2086
NULL, //the allocator
2087
(const UInt8*)szUnConv,
2090
false //no BOM/external representation
2093
wxASSERT(theString);
2095
// length of the string in UniChar (UTF-16) units
size_t nOutLength = CFStringGetLength(theString);
2099
// NOTE(review): presumably the early exit taken when only the length is
// wanted -- confirm
CFRelease(theString);
2103
// NOTE(review): the range length is nOutSize rather than nOutLength;
// confirm that callers never pass a size larger than the string
CFRange theRange = { 0, nOutSize };
2105
#if SIZEOF_WCHAR_T == 4
2106
UniChar* szUniCharBuffer = new UniChar[nOutSize];
2109
CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2111
CFRelease(theString);
2113
// NOTE(review): index nOutLength must be inside the buffer (nOutSize
// elements in the 4-byte wchar_t build) -- confirm
szUniCharBuffer[nOutLength] = '\0' ;
2115
#if SIZEOF_WCHAR_T == 4
2116
// widen the UTF-16 data into the caller's wchar_t buffer
wxMBConvUTF16 converter ;
2117
converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
2118
delete[] szUniCharBuffer;
2124
// Converts the wide string szUnConv to the multibyte encoding m_encoding.
// The input is wrapped in a CFString without copying (after conversion to
// UTF-16 when wchar_t is 4 bytes wide). The value returned is
// nRealOutSize - 1.
size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2128
size_t nRealOutSize;
2129
size_t nBufSize = wxWcslen(szUnConv);
2130
// with 2-byte wchar_t the input can be used as UTF-16 directly
UniChar* szUniBuffer = (UniChar*) szUnConv;
2132
#if SIZEOF_WCHAR_T == 4
2133
// first call computes the UTF-16 size in bytes, second one converts
wxMBConvUTF16 converter ;
2134
nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2135
szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2136
converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2137
// from here on nBufSize counts UniChars, not bytes
nBufSize /= sizeof(UniChar);
2140
CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2144
kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2147
wxASSERT(theString);
2149
//Note that CER puts a BOM when converting to unicode
2150
//so we check and use getchars instead in that case
2151
if (m_encoding == kCFStringEncodingUnicode)
2154
CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2156
nRealOutSize = CFStringGetLength(theString) + 1;
2162
CFRangeMake(0, CFStringGetLength(theString)),
2164
0, //what to put in characters that can't be converted -
2165
//0 tells CFString to return NULL if it meets such a character
2166
false, //not an external representation
2169
(CFIndex*) &nRealOutSize
2173
CFRelease(theString);
2175
#if SIZEOF_WCHAR_T == 4
2176
delete[] szUniBuffer;
2179
return nRealOutSize - 1;
2184
// usable only if the encoding is valid and Core Foundation supports it
return m_encoding != kCFStringEncodingInvalidId &&
2185
CFStringIsEncodingAvailable(m_encoding);
2189
// encoding used for the multibyte side of the conversions
CFStringEncoding m_encoding ;
2192
#endif // defined(__WXCOCOA__)
2194
// ============================================================================
2195
// Mac conversion classes
2196
// ============================================================================
2198
#if defined(__WXMAC__) && defined(TARGET_CARBON)
2200
// wxMBConv implementation for Carbon builds of wxMac: it owns two Text
// Encoding Converter objects, one per direction, between m_char_encoding and
// 16 bit Unicode (m_unicode_encoding).
class wxMBConv_mac : public wxMBConv
2205
// presumably the default constructor's body: use the system encoding
Init(CFStringGetSystemEncoding()) ;
2209
// construct from a charset name: the name is first mapped to a
// wxFontEncoding (non-interactively), then to a Mac text encoding
wxMBConv_mac(const wxChar* name)
2211
Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2215
// construct from a wxFontEncoding
wxMBConv_mac(wxFontEncoding encoding)
2217
Init( wxMacGetSystemEncFromFontEnc(encoding) );
2222
// dispose of both converters
// NOTE(review): status is overwritten and not examined in the lines shown
OSStatus status = noErr ;
2223
status = TECDisposeConverter(m_MB2WC_converter);
2224
status = TECDisposeConverter(m_WC2MB_converter);
2228
// common part of all constructors: stores the encodings and creates the
// two converter objects
void Init( TextEncodingBase encoding)
2230
OSStatus status = noErr ;
2231
m_char_encoding = encoding ;
2232
m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2234
status = TECCreateConverter(&m_MB2WC_converter,
2236
m_unicode_encoding);
2237
status = TECCreateConverter(&m_WC2MB_converter,
2242
// Converts the NUL-terminated multibyte string psz to wide characters using
// m_MB2WC_converter. The converter produces UniChar data; when wchar_t is
// 4 bytes wide that data goes to a temporary buffer and is widened with
// wxMBConvUTF16. A temporary output buffer (tbuf) stands in for buf when
// buf is NULL.
size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2244
OSStatus status = noErr ;
2245
ByteCount byteOutLen ;
2246
ByteCount byteInLen = strlen(psz) ;
2247
wchar_t *tbuf = NULL ;
2248
UniChar* ubuf = NULL ;
2253
//apple specs say at least 32
2254
n = wxMax( 32 , byteInLen ) ;
2255
tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2257
ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2258
#if SIZEOF_WCHAR_T == 4
2259
// two extra bytes leave room for the terminating UniChar written below
ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2261
ubuf = (UniChar*) (buf ? buf : tbuf) ;
2263
status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2264
(TextPtr) ubuf , byteBufferLen, &byteOutLen);
2265
#if SIZEOF_WCHAR_T == 4
2266
// we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2267
// is not properly terminated we get random characters at the end
2268
ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2269
wxMBConvUTF16 converter ;
2270
res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2273
// with 2-byte wchar_t the result length is simply the UniChar count
res = byteOutLen / sizeof( UniChar ) ;
2278
if ( buf && res < n)
2284
// Converts the NUL-terminated wide string psz to m_char_encoding using
// m_WC2MB_converter (after conversion to UTF-16 when wchar_t is 4 bytes
// wide). When a real output buffer was supplied, the result is converted
// back with MB2WC() and compared with the input so that lossy conversions
// can be detected.
size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2286
OSStatus status = noErr ;
2287
ByteCount byteOutLen ;
2288
ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2294
//apple specs say at least 32
2295
n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2296
tbuf = (char*) malloc( n ) ;
2299
ByteCount byteBufferLen = n ;
2300
UniChar* ubuf = NULL ;
2301
#if SIZEOF_WCHAR_T == 4
2302
// first call computes the UTF-16 size in bytes, second one converts
wxMBConvUTF16 converter ;
2303
size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2304
byteInLen = unicharlen ;
2305
ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2306
converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2308
// with 2-byte wchar_t the input can be used as UTF-16 directly
ubuf = (UniChar*) psz ;
2310
status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2311
(TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2312
#if SIZEOF_WCHAR_T == 4
2318
size_t res = byteOutLen ;
2319
if ( buf && res < n)
2323
//we need to double-trip to verify it didn't insert any ? in place
2324
//of bogus characters
2325
wxWCharBuffer wcBuf(n);
2326
size_t pszlen = wxWcslen(psz);
2327
if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2328
wxWcslen(wcBuf) != pszlen ||
2329
memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2331
// we didn't obtain the same thing we started from, hence
2332
// the conversion was lossy and we consider that it failed
2341
// usable only if both converter objects exist
{ return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL ; }
2344
// converter objects, one per direction
TECObjectRef m_MB2WC_converter ;
2345
TECObjectRef m_WC2MB_converter ;
2347
// the multibyte encoding and the 16 bit Unicode encoding set up in Init()
TextEncodingBase m_char_encoding ;
2348
TextEncodingBase m_unicode_encoding ;
2351
#endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2353
// ============================================================================
2354
// wxEncodingConverter based conversion classes
2355
// ============================================================================
2359
// wxMBConv implementation built on wxEncodingConverter: m2w converts from
// m_enc to Unicode and w2m converts back.
class wxMBConv_wxwin : public wxMBConv
2364
// both directions must initialise successfully for the object to be usable
m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2365
w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2369
// temporarily just use wxEncodingConverter stuff,
2370
// so that it works while a better implementation is built
2371
wxMBConv_wxwin(const wxChar* name)
2374
// map the charset name to an encoding without any user interaction
m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2376
m_enc = wxFONTENCODING_SYSTEM;
2381
// construct directly from a wxFontEncoding
wxMBConv_wxwin(wxFontEncoding enc)
2388
// Converts psz to wide characters with m2w; the output size argument is
// not used.
size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2390
size_t inbuf = strlen(psz);
2393
if (!m2w.Convert(psz,buf))
2399
// Converts psz to the multibyte encoding with w2m; the output size
// argument is not used.
size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2401
const size_t inbuf = wxWcslen(psz);
2404
if (!w2m.Convert(psz,buf))
2411
bool IsOk() const { return m_ok; }
2414
// the multibyte encoding and the converters for the two directions
wxFontEncoding m_enc;
2415
wxEncodingConverter m2w, w2m;
2417
// were we initialized successfully?
2420
DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2423
// make the constructors available for unit testing
//
// Creates a wxMBConv_wxwin for the given charset name and checks whether it
// initialised successfully.
// NOTE(review): the handling of a failed IsOk() check is presumably to
// destroy the object and return NULL -- confirm.
2424
WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2426
wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2427
if ( !result->IsOk() )
2435
#endif // wxUSE_FONTMAP
2437
// ============================================================================
2438
// wxCSConv implementation
2439
// ============================================================================
2441
void wxCSConv::Init()
2448
// Constructs a converter identified by a charset name; m_encoding is set to
// wxFONTENCODING_SYSTEM here.
wxCSConv::wxCSConv(const wxChar *charset)
2457
m_encoding = wxFONTENCODING_SYSTEM;
2460
// Constructs a converter identified by a wxFontEncoding value.
// wxFONTENCODING_MAX and wxFONTENCODING_DEFAULT are not acceptable: they
// trigger a debug failure message and are replaced by wxFONTENCODING_SYSTEM.
wxCSConv::wxCSConv(wxFontEncoding encoding)
2462
if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2464
wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2466
// fall back to the system encoding instead of the invalid value
encoding = wxFONTENCODING_SYSTEM;
2471
m_encoding = encoding;
2474
wxCSConv::~wxCSConv()
2479
// Copy constructor: takes over the charset name (through SetName()) and the
// encoding of conv.
wxCSConv::wxCSConv(const wxCSConv& conv)
2484
SetName(conv.m_name);
2485
m_encoding = conv.m_encoding;
2488
// Assignment: takes over the charset name (through SetName()) and the
// encoding of conv.
wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2492
SetName(conv.m_name);
2493
m_encoding = conv.m_encoding;
2498
void wxCSConv::Clear()
2507
// Stores a private copy (made with wxStrdup()) of the charset name in m_name.
void wxCSConv::SetName(const wxChar *charset)
2511
m_name = wxStrdup(charset);
2517
// Cache used by wxCSConv::DoCreate(): for each wxFontEncoding it remembers
// the charset name with which a wxMBConv_iconv was created; an empty string
// records that no name worked.
#include "wx/hashmap.h"
2519
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
2520
wxEncodingNameCache );
2522
static wxEncodingNameCache gs_nameCache;
2525
// Creates the wxMBConv object that performs the real conversion for this
// wxCSConv, based on m_name and m_encoding.
//
// As described in the body, OS-provided conversions are tried first, then
// the hard-coded UTF converters and finally wxEncodingConverter. If nothing
// works an error is logged.
// NOTE(review): the value returned for ISO8859-1 and on failure is
// presumably NULL ("don't convert at all") -- confirm.
wxMBConv *wxCSConv::DoCreate() const
2528
wxLogTrace(TRACE_STRCONV,
2529
wxT("creating conversion for %s"),
2531
: wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
2532
#endif // wxUSE_FONTMAP
2534
// check for the special case of ASCII or ISO8859-1 charset: as we have
2535
// special knowledge of it anyhow, we don't need to create a special
2536
// conversion object
2537
if ( m_encoding == wxFONTENCODING_ISO8859_1 )
2539
// don't convert at all
2543
// we trust OS to do conversion better than we can so try external
2544
// conversion methods first
2546
// the full order is:
2547
// 1. OS conversion (iconv() under Unix or Win32 API)
2548
// 2. hard coded conversions for UTF
2549
// 3. wxEncodingConverter as fall back
2555
#endif // !wxUSE_FONTMAP
2557
// work on local copies of the name and the encoding
wxString name(m_name);
2558
wxFontEncoding encoding(m_encoding);
2560
// an explicit charset name is tried with iconv first
if ( !name.empty() )
2562
wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2570
wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2571
#endif // wxUSE_FONTMAP
2575
// look for a cached result for this encoding; an empty cached name means
// that an earlier attempt failed
const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2576
if ( it != gs_nameCache.end() )
2578
if ( it->second.empty() )
2581
wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
2588
// nothing cached: try every known name of the encoding in turn
const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2590
for ( ; *names; ++names )
2592
wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
2595
// remember the name that was used
gs_nameCache[encoding] = *names;
2602
gs_nameCache[encoding] = _T(""); // cache the failure
2604
#endif // wxUSE_FONTMAP
2606
#endif // HAVE_ICONV
2608
#ifdef wxHAVE_WIN32_MB2WC
2611
// Win32 API based conversion, by name if there is one, else by encoding
wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2612
: new wxMBConv_win32(m_encoding);
2621
#endif // wxHAVE_WIN32_MB2WC
2622
#if defined(__WXMAC__)
2624
// leave UTF16 and UTF32 to the built-ins of wx
2625
if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2626
( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2630
wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2631
: new wxMBConv_mac(m_encoding);
2633
wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2642
#if defined(__WXCOCOA__)
2644
// Core Foundation based conversion is used only for a named charset or
// for encodings up to wxFONTENCODING_UTF16
if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2648
wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2649
: new wxMBConv_cocoa(m_encoding);
2651
wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
2661
// next step: the hard-coded UTF converters, selected by encoding
wxFontEncoding enc = m_encoding;
2663
if ( enc == wxFONTENCODING_SYSTEM && m_name )
2665
// use "false" to suppress interactive dialogs -- we can be called from
2666
// anywhere and popping up a dialog from here is the last thing we want to
2668
enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2670
#endif // wxUSE_FONTMAP
2674
case wxFONTENCODING_UTF7:
2675
return new wxMBConvUTF7;
2677
case wxFONTENCODING_UTF8:
2678
return new wxMBConvUTF8;
2680
case wxFONTENCODING_UTF16BE:
2681
return new wxMBConvUTF16BE;
2683
case wxFONTENCODING_UTF16LE:
2684
return new wxMBConvUTF16LE;
2686
case wxFONTENCODING_UTF32BE:
2687
return new wxMBConvUTF32BE;
2689
case wxFONTENCODING_UTF32LE:
2690
return new wxMBConvUTF32LE;
2693
// nothing to do but put here to suppress gcc warnings
2700
// last resort: the wxEncodingConverter based implementation
wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2701
: new wxMBConv_wxwin(m_encoding);
2707
#endif // wxUSE_FONTMAP
2709
// NB: This is a hack to prevent deadlock. What could otherwise happen
2710
// in Unicode build: wxConvLocal creation ends up being here
2711
// because of some failure and logs the error. But wxLog will try to
2712
// attach timestamp, for which it will need wxConvLocal (to convert
2713
// time to char* and then wchar_t*), but that fails, tries to log
2714
// error, but wxLog has a (already locked) critical section that
2715
// guards static buffer.
2716
static bool alreadyLoggingError = false;
2717
if (!alreadyLoggingError)
2719
alreadyLoggingError = true;
2720
wxLogError(_("Cannot convert from the charset '%s'!"),
2724
wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
2725
#else // !wxUSE_FONTMAP
2726
wxString::Format(_("encoding %s"), m_encoding).c_str()
2727
#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2729
alreadyLoggingError = false;
2735
// Performs the deferred creation of the real converter: m_convReal is set
// from DoCreate() and m_deferred is reset. The method is const, so the
// members are modified through a cast-away-const pointer to this object.
void wxCSConv::CreateConvIfNeeded() const
2739
wxCSConv *self = (wxCSConv *)this; // const_cast
2742
// if we have neither the name nor the encoding, use the default
2743
// encoding for this system
2744
if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2746
self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
2748
#endif // wxUSE_INTL
2750
self->m_convReal = DoCreate();
2751
self->m_deferred = false;
2755
// Converts the multibyte string psz to wide characters.
// The real converter is created first if necessary and the call is forwarded
// to it. The remaining code copies every byte of psz, including the
// terminating NUL, to buf as an unsigned value.
// NOTE(review): presumably this is the path taken when m_convReal is NULL --
// confirm.
size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2757
CreateConvIfNeeded();
2760
return m_convReal->MB2WC(buf, psz, n);
2763
size_t len = strlen(psz);
2767
// "<=" so that the terminating NUL is copied as well
for (size_t c = 0; c <= len; c++)
2768
buf[c] = (unsigned char)(psz[c]);
2774
// Converts the wide string psz to multibyte characters.
// The real converter is created first if necessary and the call is forwarded
// to it. The remaining code handles the string character by character,
// including the terminating NUL, narrowing each one with a plain cast.
// NOTE(review): presumably this is the path taken when m_convReal is NULL --
// confirm.
size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2776
CreateConvIfNeeded();
2779
return m_convReal->WC2MB(buf, psz, n);
2782
const size_t len = wxWcslen(psz);
2785
// "<=" so that the terminating NUL is processed as well
for (size_t c = 0; c <= len; c++)
2789
buf[c] = (char)psz[c];
2794
for (size_t c = 0; c <= len; c++)
2804
// ----------------------------------------------------------------------------
2806
// ----------------------------------------------------------------------------
2809
// The object behind wxConvLibc: its class is chosen per platform by the
// surrounding preprocessor conditions (wxMBConv_win32, wxMBConv_mac or
// wxMBConvLibc).
static wxMBConv_win32 wxConvLibcObj;
2810
#elif defined(__WXMAC__) && !defined(__MACH__)
2811
static wxMBConv_mac wxConvLibcObj ;
2813
static wxMBConvLibc wxConvLibcObj;
2816
// file-level objects behind the other predefined converters
static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
2817
static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
2818
static wxMBConvUTF7 wxConvUTF7Obj;
2819
static wxMBConvUTF8 wxConvUTF8Obj;
2821
// exported references to the objects above; wxConvCurrent initially points
// to the libc converter object
WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
2822
WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
2823
WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
2824
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
2825
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
2826
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
2827
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
2835
#else // !wxUSE_WCHAR_T
2837
// stand-ins in absence of wchar_t
2838
WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
2843
#endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T