2
* WinPR: Windows Portable Runtime
3
* Unicode Conversion (CRT)
5
* Copyright 2012 Marc-Andre Moreau <marcandre.moreau@gmail.com>
7
* Licensed under the Apache License, Version 2.0 (the "License");
8
* you may not use this file except in compliance with the License.
9
* You may obtain a copy of the License at
11
* http://www.apache.org/licenses/LICENSE-2.0
13
* Unless required by applicable law or agreed to in writing, software
14
* distributed under the License is distributed on an "AS IS" BASIS,
15
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
* See the License for the specific language governing permissions and
17
* limitations under the License.
27
#include <winpr/crt.h>
28
#include <winpr/print.h>
35
* Notes on cross-platform Unicode portability:
37
* Unicode has many possible Unicode Transformation Format (UTF) encodings,
38
* where some of the most commonly used are UTF-8, UTF-16 and sometimes UTF-32.
40
* The number in the UTF encoding name (8, 16, 32) refers to the number of bits
41
* per code unit. A code unit is the minimal bit combination that can represent
42
* a unit of encoded text in the given encoding. For instance, UTF-8 encodes
43
* the English alphabet using 8 bits (or one byte) each, just like in ASCII.
45
* However, the total number of code points (values in the Unicode codespace)
46
* only fits completely within 32 bits. This means that for UTF-8 and UTF-16,
47
* more than one code unit may be required to fully encode a specific value.
48
* UTF-8 and UTF-16 are variable-width encodings, while UTF-32 is fixed-width.
50
* UTF-8 has the advantage of being backwards compatible with ASCII, and is
51
* one of the most commonly used Unicode encodings.
53
* UTF-16 is used everywhere in the Windows API. The strategy employed by
54
* Microsoft to provide backwards compatibility in their API was to create
55
* an ANSI and a Unicode version of the same function, ending with A (ANSI)
56
* and W (Wide character, or UTF-16 Unicode). In headers, the original
57
* function name is replaced by a macro that expands to either the ANSI
58
* or Unicode version based on the definition of the _UNICODE macro.
60
* UTF-32 has the advantage of being fixed width, but wastes a lot of space
61
* for English text (4x more than UTF-8, 2x more than UTF-16).
63
* In C, wide character strings are often defined with the wchar_t type.
64
* Many functions are provided to deal with those wide character strings,
65
* such as wcslen (strlen equivalent) or wprintf (printf equivalent).
67
* This may lead to some confusion, since many of these functions exist
68
* on both Windows and Linux, but they are *not* the same!
70
* This sample hello world is a good example:
74
* wchar_t hello[] = L"Hello, World!\n";
76
* int main(int argc, char** argv)
79
* wprintf(L"sizeof(wchar_t): %d\n", (int) sizeof(wchar_t));
83
* There is a reason why the sample prints the size of the wchar_t type:
84
* On Windows, wchar_t is two bytes (UTF-16), while on most other systems
85
* it is 4 bytes (UTF-32). This means that if you write code on Windows,
86
* and use L"" to define a string which is meant to be UTF-16 and not UTF-32,
87
* you will have a little surprise when trying to port your code to Linux.
89
* Since the Windows API uses UTF-16, not UTF-32, WinPR defines the WCHAR
90
* type to always be 2-bytes long and uses it instead of wchar_t. Do not
91
* ever use wchar_t with WinPR unless you know what you are doing.
93
* As for L"", it is unfortunately unusable in a portable way, unless a
94
* special option is passed to GCC to define wchar_t as being two bytes.
95
* For string constants that must be UTF-16, it is a pain, but they can
96
* be defined in a portable way like this:
98
* WCHAR hello[] = { 'H','e','l','l','o','\0' };
100
* Such strings cannot be passed to native functions like wcslen(), which
101
* may expect a different wchar_t size. For this reason, WinPR provides
102
* _wcslen, which expects UTF-16 WCHAR strings on all platforms.
107
* Conversion to Unicode (UTF-16)
108
* MultiByteToWideChar: http://msdn.microsoft.com/en-us/library/windows/desktop/dd319072/
110
* cbMultiByte is an input size in bytes (BYTE)
111
* cchWideChar is an output size in wide characters (WCHAR)
113
* Null-terminated UTF-8 strings:
115
* cchWideChar *cannot* be assumed to be cbMultiByte since UTF-8 is variable-width!
117
* Instead, obtain the required cchWideChar output size like this:
118
* cchWideChar = MultiByteToWideChar(CP_UTF8, 0, (LPCSTR) lpMultiByteStr, -1, NULL, 0);
120
* A value of -1 for cbMultiByte indicates that the input string is null-terminated,
121
* and the null terminator *will* be processed. The size returned by MultiByteToWideChar
122
* will therefore include the null terminator. Equivalent behavior can be obtained by
123
* computing the length in bytes of the input buffer, including the null terminator:
125
* cbMultiByte = strlen((char*) lpMultiByteStr) + 1;
127
* An output buffer of the proper size can then be allocated:
129
* lpWideCharStr = (LPWSTR) malloc(cchWideChar * sizeof(WCHAR));
131
* Since cchWideChar is an output size in wide characters, the actual buffer size is:
132
* (cchWideChar * sizeof(WCHAR)) or (cchWideChar * 2)
134
* Finally, perform the conversion:
136
* cchWideChar = MultiByteToWideChar(CP_UTF8, 0, (LPCSTR) lpMultiByteStr, -1, lpWideCharStr, cchWideChar);
138
* The value returned by MultiByteToWideChar corresponds to the number of wide characters written
139
* to the output buffer, and should match the value obtained on the first call to MultiByteToWideChar.
143
/*
 * MultiByteToWideChar: converts the byte string lpMultiByteStr to UTF-16
 * by calling ConvertUTF8toUTF16 with strictConversion.
 *
 * cbMultiByte   number of input bytes; -1 is replaced by strlen() + 1, so
 *               the null terminator becomes part of the converted input.
 * cchWideChar   capacity of lpWideCharStr in WCHARs; 0 selects the
 *               size-query path, which hands the converter a NULL output
 *               range instead of lpWideCharStr.
 *
 * On both paths the distance the converter advanced the target pointer
 * (in WCHARs) is stored back into cchWideChar.
 *
 * NOTE(review): CodePage and dwFlags are not referenced by the statements
 * below, so the input appears to be treated as UTF-8 whatever CodePage
 * says - confirm this is intended.
 * NOTE(review): 'result' is assigned but never inspected by the statements
 * below - confirm whether conversion errors should be reported.
 */
int MultiByteToWideChar(UINT CodePage, DWORD dwFlags, LPCSTR lpMultiByteStr,
144
int cbMultiByte, LPWSTR lpWideCharStr, int cchWideChar)
148
const BYTE* sourceStart;
149
ConversionResult result;
151
/* If cbMultiByte is 0, the function fails */
153
if (cbMultiByte == 0)
156
/* If cbMultiByte is -1, the string is null-terminated */
158
if (cbMultiByte == -1)
159
cbMultiByte = strlen((char*) lpMultiByteStr) + 1;
162
* if cchWideChar is 0, the function returns the required buffer size
163
* in characters for lpWideCharStr and makes no use of the output parameter itself.
166
if (cchWideChar == 0)
168
/*
 * Size query: the target range is (NULL, NULL); presumably
 * ConvertUTF8toUTF16 only advances targetStart in this mode without
 * storing anything - confirm against its implementation.
 */
sourceStart = (const BYTE*) lpMultiByteStr;
169
targetStart = (WCHAR*) NULL;
171
result = ConvertUTF8toUTF16(&sourceStart, &sourceStart[cbMultiByte],
172
&targetStart, NULL, strictConversion);
174
/*
 * NOTE(review): this subtracts a null pointer from a pointer derived from
 * a null pointer; ISO C does not define pointer arithmetic outside an
 * array object, so the count relies on implementation behavior.
 */
length = targetStart - ((WCHAR*) NULL);
175
cchWideChar = length;
179
/* Real conversion: output is bounded by &lpWideCharStr[cchWideChar]. */
sourceStart = (const BYTE*) lpMultiByteStr;
180
targetStart = lpWideCharStr;
182
result = ConvertUTF8toUTF16(&sourceStart, &sourceStart[cbMultiByte],
183
&targetStart, &targetStart[cchWideChar], strictConversion);
185
/* Number of WCHARs the converter advanced past the start of the buffer. */
length = targetStart - ((WCHAR*) lpWideCharStr);
186
cchWideChar = length;
193
* Conversion from Unicode (UTF-16)
194
* WideCharToMultiByte: http://msdn.microsoft.com/en-us/library/windows/desktop/dd374130/
196
* cchWideChar is an input size in wide characters (WCHAR)
197
* cbMultiByte is an output size in bytes (BYTE)
199
* Null-terminated UTF-16 strings:
201
* cbMultiByte *cannot* be assumed to be cchWideChar since UTF-8 is variable-width!
203
* Instead, obtain the required cbMultiByte output size like this:
204
* cbMultiByte = WideCharToMultiByte(CP_UTF8, 0, (LPCWSTR) lpWideCharStr, -1, NULL, 0, NULL, NULL);
206
* A value of -1 for cchWideChar indicates that the input string is null-terminated,
207
* and the null terminator *will* be processed. The size returned by WideCharToMultiByte
208
* will therefore include the null terminator. Equivalent behavior can be obtained by
209
* computing the length in wide characters of the input buffer, including the null terminator:
211
* cchWideChar = _wcslen((WCHAR*) lpWideCharStr) + 1;
213
* An output buffer of the proper size can then be allocated:
214
* lpMultiByteStr = (LPSTR) malloc(cbMultiByte);
216
* Since cbMultiByte is an output size in bytes, it is the same as the buffer size
218
* Finally, perform the conversion:
220
* cbMultiByte = WideCharToMultiByte(CP_UTF8, 0, (LPCWSTR) lpWideCharStr, -1, lpMultiByteStr, cbMultiByte, NULL, NULL);
222
* The value returned by WideCharToMultiByte corresponds to the number of bytes written
223
* to the output buffer, and should match the value obtained on the first call to WideCharToMultiByte.
227
/*
 * WideCharToMultiByte: converts the WCHAR string lpWideCharStr to a byte
 * string by calling ConvertUTF16toUTF8 with strictConversion.
 *
 * cchWideChar   number of input WCHARs; -1 is replaced by _wcslen() + 1,
 *               so the null terminator becomes part of the converted input.
 * cbMultiByte   capacity of lpMultiByteStr in bytes; 0 selects the
 *               size-query path, which hands the converter a NULL output
 *               range instead of lpMultiByteStr.
 *
 * On both paths the distance the converter advanced the target pointer
 * (in bytes) is stored back into cbMultiByte.
 *
 * NOTE(review): CodePage, dwFlags, lpDefaultChar and lpUsedDefaultChar are
 * not referenced by the statements below, so the output appears to be
 * UTF-8 whatever CodePage says - confirm this is intended.
 * NOTE(review): 'result' is assigned but never inspected by the statements
 * below - confirm whether conversion errors should be reported.
 */
int WideCharToMultiByte(UINT CodePage, DWORD dwFlags, LPCWSTR lpWideCharStr, int cchWideChar,
228
LPSTR lpMultiByteStr, int cbMultiByte, LPCSTR lpDefaultChar, LPBOOL lpUsedDefaultChar)
232
const WCHAR* sourceStart;
233
ConversionResult result;
235
/* If cchWideChar is 0, the function fails */
237
if (cchWideChar == 0)
240
/* If cchWideChar is -1, the string is null-terminated */
242
if (cchWideChar == -1)
243
cchWideChar = _wcslen(lpWideCharStr) + 1;
246
* if cbMultiByte is 0, the function returns the required buffer size
247
* in bytes for lpMultiByteStr and makes no use of the output parameter itself.
250
if (cbMultiByte == 0)
252
/*
 * Size query: the target range is (NULL, NULL); presumably
 * ConvertUTF16toUTF8 only advances targetStart in this mode without
 * storing anything - confirm against its implementation.
 */
sourceStart = (WCHAR*) lpWideCharStr;
253
targetStart = (BYTE*) NULL;
255
result = ConvertUTF16toUTF8(&sourceStart, &sourceStart[cchWideChar],
256
&targetStart, NULL, strictConversion);
258
/*
 * NOTE(review): this subtracts a null pointer from a pointer derived from
 * a null pointer; ISO C does not define pointer arithmetic outside an
 * array object, so the count relies on implementation behavior.
 */
length = targetStart - ((BYTE*) NULL);
259
cbMultiByte = length;
263
/* Real conversion: output is bounded by &lpMultiByteStr[cbMultiByte]. */
sourceStart = (WCHAR*) lpWideCharStr;
264
targetStart = (BYTE*) lpMultiByteStr;
266
result = ConvertUTF16toUTF8(&sourceStart, &sourceStart[cchWideChar],
267
&targetStart, &targetStart[cbMultiByte], strictConversion);
269
/* Number of bytes the converter advanced past the start of the buffer. */
length = targetStart - ((BYTE*) lpMultiByteStr);
270
cbMultiByte = length;
278
/*
 * ConvertToUnicode: wrapper around MultiByteToWideChar that receives the
 * output buffer through a pointer (LPWSTR*) and can store a buffer obtained
 * from malloc() into *lpWideCharStr.
 *
 * cbMultiByte   number of input bytes; -1 is replaced by strlen() + 1.
 * cchWideChar   output capacity in WCHARs; when 0, the required size is
 *               first obtained from MultiByteToWideChar(..., NULL, 0).
 *
 * The statements below test both cchWideChar == 0 and *lpWideCharStr == NULL
 * ahead of the malloc(); presumably the 'allocate' flag ties those tests to
 * the allocation - confirm. A buffer stored here comes from malloc(), so the
 * caller presumably releases it with free() - confirm against callers.
 */
int ConvertToUnicode(UINT CodePage, DWORD dwFlags, LPCSTR lpMultiByteStr,
279
int cbMultiByte, LPWSTR* lpWideCharStr, int cchWideChar)
282
BOOL allocate = FALSE;
290
/* -1 means null-terminated input: count the terminator as well. */
if (cbMultiByte == -1)
291
cbMultiByte = strlen(lpMultiByteStr) + 1;
293
/* No capacity supplied: ask MultiByteToWideChar for the required size. */
if (cchWideChar == 0)
295
cchWideChar = MultiByteToWideChar(CodePage, dwFlags, lpMultiByteStr, cbMultiByte, NULL, 0);
302
if (!(*lpWideCharStr))
306
/*
 * NOTE(review): no NULL check on the malloc() result is visible before it
 * is handed to MultiByteToWideChar as the output buffer.
 */
*lpWideCharStr = (LPWSTR) malloc(cchWideChar * sizeof(WCHAR));
308
status = MultiByteToWideChar(CodePage, dwFlags, lpMultiByteStr, cbMultiByte, *lpWideCharStr, cchWideChar);
310
/* A count different from the expected cchWideChar is singled out here. */
if (status != cchWideChar)
316
/*
 * ConvertFromUnicode: wrapper around WideCharToMultiByte that receives the
 * output buffer through a pointer (LPSTR*) and can store a buffer obtained
 * from malloc() into *lpMultiByteStr.
 *
 * cchWideChar   number of input WCHARs; -1 is replaced by _wcslen() + 1.
 * cbMultiByte   output capacity in bytes; when 0, the required size is
 *               first obtained from WideCharToMultiByte(..., NULL, 0, ...).
 *
 * The statements below test both cbMultiByte == 0 and *lpMultiByteStr == NULL
 * ahead of the malloc(); presumably the 'allocate' flag ties those tests to
 * the allocation - confirm. A buffer stored here comes from malloc(), so the
 * caller presumably releases it with free() - confirm against callers.
 */
int ConvertFromUnicode(UINT CodePage, DWORD dwFlags, LPCWSTR lpWideCharStr, int cchWideChar,
317
LPSTR* lpMultiByteStr, int cbMultiByte, LPCSTR lpDefaultChar, LPBOOL lpUsedDefaultChar)
320
BOOL allocate = FALSE;
328
/* -1 means null-terminated input: count the terminator as well. */
if (cchWideChar == -1)
329
cchWideChar = _wcslen(lpWideCharStr) + 1;
331
/* No capacity supplied: ask WideCharToMultiByte for the required size. */
if (cbMultiByte == 0)
333
cbMultiByte = WideCharToMultiByte(CodePage, dwFlags, lpWideCharStr, cchWideChar, NULL, 0, NULL, NULL);
340
if (!(*lpMultiByteStr))
345
/*
 * One byte more than cbMultiByte is allocated and the whole buffer is
 * zero-filled; the conversion below is limited to cbMultiByte bytes, so the
 * extra trailing byte stays 0.
 * NOTE(review): the malloc() result is passed to ZeroMemory without a NULL
 * check.
 */
*lpMultiByteStr = (LPSTR) malloc(cbMultiByte + 1);
346
ZeroMemory(*lpMultiByteStr, cbMultiByte + 1);
349
status = WideCharToMultiByte(CodePage, dwFlags, lpWideCharStr, cchWideChar,
350
*lpMultiByteStr, cbMultiByte, lpDefaultChar, lpUsedDefaultChar);
352
/* A count different from the expected cbMultiByte is singled out here. */
if (status != cbMultiByte)