2
* Copyright (C) 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
3
* Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
4
* Copyright (C) 2007-2009 Torch Mobile, Inc.
6
* Redistribution and use in source and binary forms, with or without
7
* modification, are permitted provided that the following conditions
9
* 1. Redistributions of source code must retain the above copyright
10
* notice, this list of conditions and the following disclaimer.
11
* 2. Redistributions in binary form must reproduce the above copyright
12
* notice, this list of conditions and the following disclaimer in the
13
* documentation and/or other materials provided with the distribution.
15
* THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
16
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
19
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
#include "TextEncoding.h"
31
#include "TextCodec.h"
32
#include "TextEncodingRegistry.h"
33
#include <wtf/OwnPtr.h>
34
#include <wtf/StdLibExtras.h>
35
#include <wtf/text/CString.h>
36
#include <wtf/text/WTFString.h>
39
#include <unicode/unorm.h>
40
#elif USE(QT4_UNICODE)
42
#elif USE(GLIB_UNICODE)
44
#include <wtf/gobject/GOwnPtr.h>
49
// File-local accessor for the UTF-7 encoding. The instance is a function-local
// static, so it is constructed on the first call and reused afterwards.
static const TextEncoding& UTF7Encoding()
{
    static TextEncoding utf7("UTF-7");
    return utf7;
}
55
// Builds an encoding from a C-string name. The stored name is whatever
// atomicCanonicalTextEncodingName() returns for it; the backslash replacement
// character is then computed from that stored name, so m_name must be
// initialized first.
TextEncoding::TextEncoding(const char* name) :
    m_name(atomicCanonicalTextEncodingName(name)),
    m_backslashAsCurrencySymbol(backslashAsCurrencySymbol())
{
}
61
// Builds an encoding from a String name. The stored name is whatever
// atomicCanonicalTextEncodingName() returns for it; the backslash replacement
// character is then computed from that stored name, so m_name must be
// initialized first.
TextEncoding::TextEncoding(const String& name) :
    m_name(atomicCanonicalTextEncodingName(name)),
    m_backslashAsCurrencySymbol(backslashAsCurrencySymbol())
{
}
67
// Decodes `length` bytes starting at `data` by delegating to the codec that
// newTextCodec(*this) returns. `stopOnError` and `sawError` are forwarded
// unchanged; the third argument passed to the codec is always `true`
// (presumably a "flush" flag -- confirm against TextCodec::decode).
// NOTE(review): the embedded numbering jumps from 67 to 72 here, so statements
// between the signature and the return appear to be missing from this excerpt.
String TextEncoding::decode(const char* data, size_t length, bool stopOnError, bool& sawError) const
72
return newTextCodec(*this)->decode(data, length, true, stopOnError, sawError);
75
// Encodes `length` UTF-16 code units starting at `characters` with the codec
// returned by newTextCodec(*this), passing `handling` through to the codec.
// In the ICU, Qt and GLib branches the text is first normalized to NFC; which
// branch is compiled is selected by the USE()/OS() preprocessor conditions.
// NOTE(review): several lines are missing from this excerpt (the opening
// conditional, some guards and declarations); the notes below mark the gaps.
CString TextEncoding::encode(const UChar* characters, size_t length, UnencodableHandling handling) const
84
// FIXME: What's the right place to do normalization?
85
// It's a little strange to do it inside the encode function.
86
// Perhaps normalization should be an explicit step done before calling encode.
88
// `source`/`sourceLength` start out as the caller's buffer and are only
// redirected to the normalized copy when normalization is actually performed.
const UChar* source = characters;
89
size_t sourceLength = length;
91
Vector<UChar> normalizedCharacters;
93
UErrorCode err = U_ZERO_ERROR;
94
// Normalize only when the quick check does not answer UNORM_YES.
if (unorm_quickCheck(source, sourceLength, UNORM_NFC, &err) != UNORM_YES) {
95
// First try using the length of the original string, since normalization to NFC rarely increases length.
96
normalizedCharacters.grow(sourceLength);
97
int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err);
98
// The first attempt did not fit: resize to the length it reported and retry.
// NOTE(review): ICU calls return immediately when *err already holds a
// failure code, so err has to be reset to U_ZERO_ERROR before the retry.
// The excerpt skips a line right here (98 -> 100) -- confirm the reset exists.
if (err == U_BUFFER_OVERFLOW_ERROR) {
100
normalizedCharacters.resize(normalizedLength);
101
normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err);
103
ASSERT(U_SUCCESS(err));
105
// From here on, encode the normalized copy instead of the caller's buffer.
source = normalizedCharacters.data();
106
sourceLength = normalizedLength;
108
return newTextCodec(*this)->encode(source, sourceLength, handling);
109
#elif USE(QT4_UNICODE)
110
// Qt branch: wrap the input in a QString, normalize it to form C, then encode
// the QString's UTF-16 data.
QString str(reinterpret_cast<const QChar*>(characters), length);
111
str = str.normalized(QString::NormalizationForm_C);
112
return newTextCodec(*this)->encode(reinterpret_cast<const UChar *>(str.utf16()), str.length(), handling);
113
#elif USE(GLIB_UNICODE)
114
// GLib branch: UTF-16 -> UTF-8, normalize to NFC, then UTF-8 -> UTF-16 and
// encode the result.
GOwnPtr<char> UTF8Source;
115
UTF8Source.set(g_utf16_to_utf8(characters, length, 0, 0, 0));
117
// NOTE(review): the condition guarding this fallback return is not visible in
// the excerpt (116 is skipped); presumably it tests UTF8Source -- confirm.
// If conversion to UTF-8 failed, try with the string without normalization
118
return newTextCodec(*this)->encode(characters, length, handling);
121
GOwnPtr<char> UTF8Normalized;
122
UTF8Normalized.set(g_utf8_normalize(UTF8Source.get(), -1, G_NORMALIZE_NFC));
125
// NOTE(review): the declaration of UTF16Length is not visible in this excerpt.
GOwnPtr<UChar> UTF16Normalized;
126
UTF16Normalized.set(g_utf8_to_utf16(UTF8Normalized.get(), -1, 0, &UTF16Length, 0));
128
return newTextCodec(*this)->encode(UTF16Normalized.get(), UTF16Length, handling);
129
#elif OS(WINDOWS) && USE(WCHAR_UNICODE)
130
// normalization will be done by Windows CE API
131
// This branch tolerates a null codec and returns an empty CString in that case.
OwnPtr<TextCodec> textCodec = newTextCodec(*this);
132
return textCodec.get() ? textCodec->encode(characters, length, handling) : CString();
136
// Returns the encoding name to expose to the DOM.
// NOTE(review): the return statements are not visible in this excerpt. Going
// by the comment below, the intent is to report "EUC-KR" when this encoding's
// name is the canonical form of "windows-949" -- confirm against the full file.
const char* TextEncoding::domName() const
138
// NOTE(review): the statement controlled by this check is missing here;
// presumably an early return that avoids the lookup below -- confirm.
if (noExtendedTextEncodingNameUsed())
141
// We treat EUC-KR as windows-949 (its superset), but need to expose
142
// the name 'EUC-KR' because the name 'windows-949' is not recognized by
143
// most Korean web servers even though they do use the encoding
144
// 'windows-949' with the name 'EUC-KR'.
145
// FIXME: This is not thread-safe. At the moment, this function is
146
// only accessed in a single thread, but eventually has to be made
147
// thread-safe along with usesVisualOrdering().
148
// Canonical name for "windows-949", looked up once and kept in a function-local static.
static const char* const a = atomicCanonicalTextEncodingName("windows-949");
154
// Reports whether this encoding uses visual ordering.
// NOTE(review): both return statements are missing from this excerpt. The
// visible code only looks up the canonical name for "ISO-8859-8" once into a
// function-local static; presumably the result is a comparison of that name
// with this encoding's name -- confirm against the full file.
bool TextEncoding::usesVisualOrdering() const
156
// NOTE(review): the statement controlled by this check is missing here.
if (noExtendedTextEncodingNameUsed())
159
static const char* const a = atomicCanonicalTextEncodingName("ISO-8859-8");
163
bool TextEncoding::isJapanese() const
165
return isJapaneseEncoding(m_name);
168
// Returns the character to show in place of a backslash for this encoding:
// U+00A5 when shouldShowBackslashAsCurrencySymbolIn() accepts the stored name,
// otherwise the backslash itself.
UChar TextEncoding::backslashAsCurrencySymbol() const
{
    if (shouldShowBackslashAsCurrencySymbolIn(m_name))
        return 0x00A5;

    return '\\';
}
173
bool TextEncoding::isNonByteBasedEncoding() const
175
if (noExtendedTextEncodingNameUsed()) {
176
return *this == UTF16LittleEndianEncoding()
177
|| *this == UTF16BigEndianEncoding();
180
return *this == UTF16LittleEndianEncoding()
181
|| *this == UTF16BigEndianEncoding()
182
|| *this == UTF32BigEndianEncoding()
183
|| *this == UTF32LittleEndianEncoding();
186
// Reports whether this encoding equals the file-local UTF7Encoding() instance.
// NOTE(review): the statement controlled by the noExtendedTextEncodingNameUsed()
// check is missing from this excerpt; presumably it returns false without
// calling UTF7Encoding() -- confirm against the full file.
bool TextEncoding::isUTF7Encoding() const
188
if (noExtendedTextEncodingNameUsed())
191
return *this == UTF7Encoding();
194
// Returns UTF8Encoding() when isNonByteBasedEncoding() is true for this encoding.
// NOTE(review): the fall-through return is missing from this excerpt;
// presumably it returns *this -- confirm against the full file.
const TextEncoding& TextEncoding::closestByteBasedEquivalent() const
196
if (isNonByteBasedEncoding())
197
return UTF8Encoding();
201
// HTML5 specifies that UTF-8 be used in form submission when a form is
202
// a part of a document in UTF-16, probably because UTF-16 is not a
203
// byte-based encoding and can contain 0x00. By extension, the same
204
// should be done for UTF-32. In case of UTF-7, it is a byte-based encoding,
205
// but it's fraught with problems and we'd rather steer clear of it.
206
// Returns UTF8Encoding() when this encoding is non-byte-based or is UTF-7.
// NOTE(review): the fall-through return is missing from this excerpt;
// presumably it returns *this -- confirm against the full file.
const TextEncoding& TextEncoding::encodingForFormSubmission() const
208
if (isNonByteBasedEncoding() || isUTF7Encoding())
209
return UTF8Encoding();
213
// Shared TextEncoding built from the name "ASCII"; constructed on first call
// as a function-local static and returned by reference thereafter.
const TextEncoding& ASCIIEncoding()
{
    static TextEncoding ascii("ASCII");
    return ascii;
}
219
// Shared TextEncoding built from the name "latin1"; constructed on first call
// as a function-local static and returned by reference thereafter.
const TextEncoding& Latin1Encoding()
{
    static TextEncoding latin1("latin1");
    return latin1;
}
225
// Shared TextEncoding built from the name "UTF-16BE"; constructed on first
// call as a function-local static and returned by reference thereafter.
const TextEncoding& UTF16BigEndianEncoding()
{
    static TextEncoding utf16BE("UTF-16BE");
    return utf16BE;
}
231
// Shared TextEncoding built from the name "UTF-16LE"; constructed on first
// call as a function-local static and returned by reference thereafter.
const TextEncoding& UTF16LittleEndianEncoding()
{
    static TextEncoding utf16LE("UTF-16LE");
    return utf16LE;
}
237
// Shared TextEncoding built from the name "UTF-32BE"; constructed on first
// call as a function-local static and returned by reference thereafter.
const TextEncoding& UTF32BigEndianEncoding()
{
    static TextEncoding utf32BE("UTF-32BE");
    return utf32BE;
}
243
// Shared TextEncoding built from the name "UTF-32LE"; constructed on first
// call as a function-local static and returned by reference thereafter.
const TextEncoding& UTF32LittleEndianEncoding()
{
    static TextEncoding utf32LE("UTF-32LE");
    return utf32LE;
}
249
// Shared TextEncoding built from the name "UTF-8"; constructed on first call
// as a function-local static. Debug builds assert on every call that the
// instance reports itself as valid.
const TextEncoding& UTF8Encoding()
{
    static TextEncoding utf8("UTF-8");
    ASSERT(utf8.isValid());
    return utf8;
}
256
// Shared TextEncoding built from the name "WinLatin-1"; constructed on first
// call as a function-local static and returned by reference thereafter.
const TextEncoding& WindowsLatin1Encoding()
{
    static TextEncoding windowsLatin1("WinLatin-1");
    return windowsLatin1;
}
262
} // namespace WebCore