2
This file is part of the KDE libraries
4
Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)
5
Copyright (C) 2003, 2004, 2005, 2006 Apple Computer, Inc.
6
Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com)
8
This library is free software; you can redistribute it and/or
9
modify it under the terms of the GNU Library General Public
10
License as published by the Free Software Foundation; either
11
version 2 of the License, or (at your option) any later version.
13
This library is distributed in the hope that it will be useful,
14
but WITHOUT ANY WARRANTY; without even the implied warranty of
15
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16
Library General Public License for more details.
18
You should have received a copy of the GNU Library General Public License
19
along with this library; see the file COPYING.LIB. If not, write to
20
the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21
Boston, MA 02111-1307, USA.
26
#include "TextResourceDecoder.h"
29
#include "DOMImplementation.h"
30
#include "DeprecatedCString.h"
31
#include "DeprecatedString.h"
32
#include "HTMLNames.h"
33
#include "TextCodec.h"
37
using namespace HTMLNames;
41
// Encoding guesses that judge() can return.
enum Type { ASCII, JIS, EUC, SJIS, UTF16, UTF8 };
42
// Guesses the encoding of the first `length` bytes at `str`.
static enum Type judge(const char* str, int length);
43
// ASCII escape character (0x1b); judge() looks for it in front of two-byte escape sequences.
static const int ESC = 0x1b;
44
// Per-byte classification table read by ISkanji() (bit 0) and ISkana() (bit 1).
static const unsigned char sjisMap[256];
45
// Returns nonzero (1) when bit 0 of sjisMap[code & 0xff] is set, 0 otherwise.
// Only the low eight bits of `code` are used, so the table index is always
// within the 256-entry table.
static int ISkanji(int code)
49
return sjisMap[code & 0xff] & 1;
51
// Returns nonzero (2) when bit 1 of sjisMap[code & 0xff] is set, 0 otherwise.
// Only the low eight bits of `code` are used, so the table index is always
// within the 256-entry table.
static int ISkana(int code)
55
return sjisMap[code & 0xff] & 2;
59
// Byte classification table, indexed by an unsigned byte value (16 entries per row).
//   1 (bit 0, tested by ISkanji): bytes 0x81-0x9f and 0xe0-0xfc
//   2 (bit 1, tested by ISkana):  bytes 0xa1-0xdf
//   0: every other byte
const unsigned char KanjiCode::sjisMap[256] = {
60
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
61
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
62
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
63
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
64
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
65
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
66
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
67
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
68
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
69
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
70
0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
71
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
72
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
73
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
74
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
75
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
80
* [0xa1 - 0xfe][0xa1 - 0xfe]
81
* 0x8e[0xa1 - 0xfe](SS2)
82
* 0x8f[0xa1 - 0xfe][0xa1 - 0xfe](SS3)
85
* [0x81 - 0x9f, 0xe0 - 0xef(0xfe?)][0x40 - 0x7e, 0x80 - 0xfc]
87
* Shift_Jis Hankaku Kana is
92
* KanjiCode::judge() is based on judge_jcode() from jvim
93
* http://hp.vector.co.jp/authors/VA003457/vim/
95
* Special Thanks to Kenichi Tsuchida
98
/*
 * Guesses which encoding the `size` bytes at `str` use and returns the
 * matching Type. The visible tests fall into two groups:
 *  - escape sequences: ESC followed by "$B", "(B", "$@", "(J", "(I" or ")I";
 *  - byte-range patterns that add to the `sjis` and `euc` evidence counters
 *    (+100 for kudokuten, +40 or +1 for kana); the two counters are compared
 *    near the end of the function (`sjis < euc`).
 * NOTE(review): what each escape-sequence branch does, and how a tie between
 * the counters is resolved, is not evident from the lines shown - confirm.
 */
enum KanjiCode::Type KanjiCode::judge(const char* str, int size)
102
int bfr = false; /* Kana Moji */
103
int bfk = 0; /* EUC Kana */
107
/* Bytes are compared as unsigned values so that 0x80-0xff test as expected. */
const unsigned char* ptr = reinterpret_cast<const unsigned char*>(str);
113
/* ESC plus at least two following bytes inside the buffer. */
if (ptr[i] == ESC && (size - i >= 3)) {
114
if ((ptr[i + 1] == '$' && ptr[i + 2] == 'B')
115
|| (ptr[i + 1] == '(' && ptr[i + 2] == 'B')) {
118
} else if ((ptr[i + 1] == '$' && ptr[i + 2] == '@')
119
|| (ptr[i + 1] == '(' && ptr[i + 2] == 'J')) {
122
} else if (ptr[i + 1] == '(' && ptr[i + 2] == 'I') {
125
} else if (ptr[i + 1] == ')' && ptr[i + 2] == 'I') {
137
/* check kudokuten (punctuation) and hiragana in the two preceding bytes */
if ((i >= 2) && (ptr[i - 2] == 0x81)
139
&& (0x41 <= ptr[i - 1] && ptr[i - 1] <= 0x49)) {
141
sjis += 100; /* kudokuten */
142
} else if ((i >= 2) && (ptr[i - 2] == 0xa1)
143
&& (0xa2 <= ptr[i - 1] && ptr[i - 1] <= 0xaa)) {
145
euc += 100; /* kudokuten */
146
} else if ((i >= 2) && (ptr[i - 2] == 0x82) && (0xa0 <= ptr[i - 1])) {
147
sjis += 40; /* hiragana */
148
} else if ((i >= 2) && (ptr[i - 2] == 0xa4) && (0xa0 <= ptr[i - 1])) {
149
euc += 40; /* hiragana */
152
/* check hiragana or katakana in the current and next byte */
if ((size - i > 1) && (ptr[i] == 0x82) && (0xa0 <= ptr[i + 1])) {
154
sjis++; /* hiragana */
155
} else if ((size - i > 1) && (ptr[i] == 0x83)
156
&& (0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x9f)) {
157
sjis++; /* katakana */
158
} else if ((size - i > 1) && (ptr[i] == 0xa4) && (0xa0 <= ptr[i + 1])) {
159
euc++; /* hiragana */
160
} else if ((size - i > 1) && (ptr[i] == 0xa5) && (0xa0 <= ptr[i + 1])) {
161
euc++; /* katakana */
164
if ((i >= 1) && (0x40 <= ptr[i] && ptr[i] <= 0xa0) && ISkanji(ptr[i - 1])) {
167
} else if ((i >= 1) && (0x81 <= ptr[i - 1] && ptr[i - 1] <= 0x9f) && ((0x40 <= ptr[i] && ptr[i] < 0x7e) || (0x7e < ptr[i] && ptr[i] <= 0xfc))) {
170
} else if ((i >= 1) && (0xfd <= ptr[i] && ptr[i] <= 0xfe) && (0xa1 <= ptr[i - 1] && ptr[i - 1] <= 0xfe)) {
173
} else if ((i >= 1) && (0xfd <= ptr[i - 1] && ptr[i - 1] <= 0xfe) && (0xa1 <= ptr[i] && ptr[i] <= 0xfe)) {
176
} else if ((i >= 1) && (ptr[i] < 0xa0 || 0xdf < ptr[i]) && (0x8e == ptr[i - 1])) {
179
} else if (ptr[i] <= 0x7f) {
183
if (0xa1 <= ptr[i] && ptr[i] <= 0xa6) {
184
euc++; /* sjis hankaku kana kigo */
185
} else if (0xa1 <= ptr[i] && ptr[i] <= 0xdf) {
186
; /* sjis hankaku kana */
187
} else if (0xa1 <= ptr[i] && ptr[i] <= 0xfe) {
189
} else if (0x8e == ptr[i]) {
191
} else if (0x20 <= ptr[i] && ptr[i] <= 0x7f) {
197
} else if (0x8e == ptr[i]) {
200
} else if (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xdf) {
201
/* EUC KANA or SJIS KANJI */
212
} else if (0x81 <= ptr[i] && ptr[i] <= 0x9f) {
216
&& ((0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x7e)
217
|| (0x80 <= ptr[i + 1] && ptr[i + 1] <= 0xfc))) {
220
} else if (0xfd <= ptr[i] && ptr[i] <= 0xfe) {
224
&& (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xfe)) {
227
} else if (ptr[i] <= 0x7f) {
240
} else if (sjis < euc) {
248
// Classifies a MIME type for charset sniffing. The visible tests are, in order:
// a case-insensitive match against "text/css", a case-insensitive match against
// "text/html", and DOMImplementation::isXMLMIMEType(). Presumably these select
// the CSS, HTML and XML content types respectively - TODO confirm the values
// returned, including the fallback when none of the tests match.
TextResourceDecoder::ContentType TextResourceDecoder::determineContentType(const String& mimeType)
250
if (equalIgnoringCase(mimeType, "text/css"))
252
if (equalIgnoringCase(mimeType, "text/html"))
254
if (DOMImplementation::isXMLMIMEType(mimeType))
259
// Picks the encoding to start decoding with, before any sniffing has happened:
//  - XML content always starts as UTF-8;
//  - otherwise Latin-1 is used when specifiedDefaultEncoding is not valid;
//  - otherwise specifiedDefaultEncoding itself is returned.
const TextEncoding& TextResourceDecoder::defaultEncoding(ContentType contentType, const TextEncoding& specifiedDefaultEncoding)
261
// Despite 8.5 "Text/xml with Omitted Charset" of RFC 3023, we assume UTF-8 instead of US-ASCII
262
// for text/xml. This matches Firefox.
263
if (contentType == XML)
264
return UTF8Encoding();
265
if (!specifiedDefaultEncoding.isValid())
266
return Latin1Encoding();
267
return specifiedDefaultEncoding;
270
// Builds a decoder for a resource of the given MIME type. The content type is
// derived from mimeType, the decoder starts with defaultEncoding() for that
// content type, the encoding source is recorded as DefaultEncoding, and none of
// the three sniffing passes (BOM, CSS charset, head charset) has run yet.
// m_contentType is passed to defaultEncoding() in the m_decoder initializer, so
// it has to be initialized before m_decoder.
TextResourceDecoder::TextResourceDecoder(const String& mimeType, const TextEncoding& specifiedDefaultEncoding)
271
: m_contentType(determineContentType(mimeType))
272
, m_decoder(defaultEncoding(m_contentType, specifiedDefaultEncoding))
273
, m_source(DefaultEncoding)
274
, m_checkedForBOM(false)
275
, m_checkedForCSSCharset(false)
276
, m_checkedForHeadCharset(false)
280
// Destructor.
TextResourceDecoder::~TextResourceDecoder()
284
// Switches the decoder to `encoding`, which was learned from `source`.
// An invalid encoding is tested for first so that the current one can be kept.
// For encodings declared inside the document (meta tag, XML header, CSS
// @charset) the decoder is reset to encoding.closest8BitEquivalent() rather
// than to `encoding` itself - presumably because such a declaration could only
// be read while treating the data as 8-bit text; TODO confirm.
// NOTE(review): decode paths later compare m_source against `source` values, so
// m_source is presumably recorded here as well - confirm.
void TextResourceDecoder::setEncoding(const TextEncoding& encoding, EncodingSource source)
286
// In case the encoding didn't exist, we keep the old one (helps some sites specifying invalid encodings).
287
if (!encoding.isValid())
290
if (source == EncodingFromMetaTag || source == EncodingFromXMLHeader || source == EncodingFromCSSCharset)
291
m_decoder.reset(encoding.closest8BitEquivalent());
293
m_decoder.reset(encoding);
298
// Returns the position of the encoding string.
// Searches `str` for the text "encoding", skips whitespace/control characters,
// expects a single or double quotation mark, and scans to the matching closing
// mark. On success `encodingLength` receives the length of the text between
// the quotation marks (end - pos).
// NOTE(review): the value returned when "encoding" or a quotation mark is not
// found is not evident from the lines shown - confirm before relying on it.
299
static int findXMLEncoding(const DeprecatedCString &str, int &encodingLength)
301
int len = str.length();
303
int pos = str.find("encoding");
308
// Skip spaces and stray control characters.
// NOTE(review): str[pos] is read before the pos != len bound is tested; testing
// the bound first would avoid reading index `len`.
309
while (str[pos] <= ' ' && pos != len)
317
// Skip spaces and stray control characters.
// NOTE(review): same operand order as the loop above - the bound is tested last.
318
while (str[pos] <= ' ' && pos != len)
321
// Skip quotation mark.
322
char quoteMark = str[pos];
323
if (quoteMark != '"' && quoteMark != '\'')
327
// Find the trailing quotation mark.
329
while (str[end] != quoteMark)
335
encodingLength = end - pos;
339
// true if there is more to parse
// `pos` is taken by reference; the loop runs while it is inside the buffer and
// points at a tab or a space. Only '\t' and ' ' count as whitespace here.
// The result is false exactly when `pos` has reached `dataEnd`.
340
static inline bool skipWhitespace(const char*& pos, const char* dataEnd)
342
while (pos < dataEnd && (*pos == '\t' || *pos == ' '))
344
return pos != dataEnd;
347
// Looks for a Unicode byte order mark in the first four bytes of the stream
// (bytes already held in m_buffer followed by the bytes of `data`) and, when one
// is found, switches the decoder with source AutoDetectedEncoding:
//   FF FE        -> UTF-16LE, or UTF-32LE when the next two bytes are both zero
//   EF BB BF     -> UTF-8
//   FE FF        -> UTF-16BE
//   00 00 FE FF  -> UTF-32BE
// A user-chosen encoding is never overridden (see the FIXME below).
// NOTE(review): when fewer than four bytes are available in total the check is
// presumably postponed to a later call - confirm.
void TextResourceDecoder::checkForBOM(const char* data, size_t len)
349
// Check for UTF-16/32 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.
351
if (m_source == UserChosenEncoding) {
352
// FIXME: Maybe a BOM should override even a user-chosen encoding.
353
m_checkedForBOM = true;
357
// Check if we have enough data.
358
size_t bufferLength = m_buffer.size();
359
if (bufferLength + len < 4)
362
m_checkedForBOM = true;
364
// Extract the first four bytes.
365
// Handle the case where some of bytes are already in the buffer.
366
// The last byte is always guaranteed to not be in the buffer.
367
const unsigned char* udata = reinterpret_cast<const unsigned char*>(data);
368
unsigned char c1 = bufferLength >= 1 ? m_buffer[0] : *udata++;
369
unsigned char c2 = bufferLength >= 2 ? m_buffer[1] : *udata++;
370
unsigned char c3 = bufferLength >= 3 ? m_buffer[2] : *udata++;
371
ASSERT(bufferLength < 4);
372
unsigned char c4 = *udata;
374
// Check for the BOM.
375
if (c1 == 0xFF && c2 == 0xFE) {
// FF FE followed by 00 00 is the UTF-32LE mark; anything else is UTF-16LE.
376
if (c3 !=0 || c4 != 0)
377
setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
379
setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);
381
else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF)
382
setEncoding(UTF8Encoding(), AutoDetectedEncoding);
383
else if (c1 == 0xFE && c2 == 0xFF)
384
setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
385
else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF)
386
setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);
389
// Looks for a CSS "@charset" rule at the very start of the buffered data.
// If an encoding has already been obtained from somewhere other than the
// default, the check is marked as done without examining the data. Otherwise
// `data` is appended to m_buffer (movedDataToBuffer is set to true so the
// caller does not append it again) and, once more than eight bytes are
// buffered, the leading bytes are compared with "@charset". A quoted name that
// follows is handed to setEncoding() with source EncodingFromCSSCharset.
// NOTE(review): the caller tests the result with `!`; presumably false means
// "need more data before decoding" - confirm against the return statements.
bool TextResourceDecoder::checkForCSSCharset(const char* data, size_t len, bool& movedDataToBuffer)
391
if (m_source != DefaultEncoding) {
392
m_checkedForCSSCharset = true;
396
size_t oldSize = m_buffer.size();
397
m_buffer.resize(oldSize + len);
398
memcpy(m_buffer.data() + oldSize, data, len);
400
movedDataToBuffer = true;
402
if (m_buffer.size() > 8) { // strlen("@charset") == 8
403
const char* dataStart = m_buffer.data();
404
const char* dataEnd = dataStart + m_buffer.size();
406
if (dataStart[0] == '@' && dataStart[1] == 'c' && dataStart[2] == 'h' && dataStart[3] == 'a' && dataStart[4] == 'r' &&
407
dataStart[5] == 's' && dataStart[6] == 'e' && dataStart[7] == 't') {
410
const char* pos = dataStart;
411
if (!skipWhitespace(pos, dataEnd))
414
if (*pos == '"' || *pos == '\'') {
415
char quotationMark = *pos;
419
while (pos < dataEnd && *pos != quotationMark)
424
CString encodingName(dataStart, pos - dataStart + 1);
427
if (!skipWhitespace(pos, dataEnd))
431
setEncoding(TextEncoding(encodingName.data()), EncodingFromCSSCharset);
434
m_checkedForCSSCharset = true;
440
// Other browsers allow comments in the head section, so we need to also.
441
// It's important not to look for tags inside the comments.
// `ptr` is taken by reference and `pEnd` bounds the scan. Two terminators are
// recognized: the regular "-->" and the lenient "--!>".
442
static inline void skipComment(const char*& ptr, const char* pEnd)
445
// Allow <!-->; other browsers do.
451
// This is the real end of comment, "-->".
452
if (p[1] == '-' && p[2] == '>') {
456
// This is the incorrect end of comment that other browsers allow, "--!>".
457
if (p[1] == '-' && p[2] == '!' && p[3] == '>') {
468
// Sniffs the start of an HTML/XML document for an encoding declaration.
// If an encoding has already been obtained from somewhere other than the
// default, the check is marked as done without examining the data. Otherwise
// `data` is appended to m_buffer (movedDataToBuffer is set to true so the
// caller does not append it again) and the buffer is examined in two stages:
//  1. the first bytes are tested for an XML declaration ("<?xml", whose
//     encoding pseudo-attribute is applied with source EncodingFromXMLHeader)
//     or for the byte patterns of "<?x" / "<?" in UTF-16 and UTF-32, either
//     byte order;
//  2. tags are then scanned, skipping comments, looking for a meta tag that
//     contains "charset"; its value is applied with source EncodingFromMetaTag.
//     Scanning is given up when a tag that does not belong in <head> is seen.
// NOTE(review): the caller tests the result with `!`; presumably false means
// "need more data before decoding" - confirm against the return statements.
bool TextResourceDecoder::checkForHeadCharset(const char* data, size_t len, bool& movedDataToBuffer)
470
if (m_source != DefaultEncoding) {
471
m_checkedForHeadCharset = true;
475
// This is not completely efficient, since the function might go
476
// through the HTML head several times.
478
size_t oldSize = m_buffer.size();
479
m_buffer.resize(oldSize + len);
480
memcpy(m_buffer.data() + oldSize, data, len);
482
movedDataToBuffer = true;
484
const char* ptr = m_buffer.data();
485
const char* pEnd = ptr + m_buffer.size();
487
// Is there enough data available to check for XML declaration?
488
if (m_buffer.size() < 8)
491
// Handle XML declaration, which can have encoding in it. This encoding is honored even for HTML documents.
492
// It is an error for an XML declaration not to be at the start of an XML document, and it is ignored in HTML documents in such case.
493
if (ptr[0] == '<' && ptr[1] == '?' && ptr[2] == 'x' && ptr[3] == 'm' && ptr[4] == 'l') {
494
const char* xmlDeclarationEnd = ptr;
495
while (xmlDeclarationEnd != pEnd && *xmlDeclarationEnd != '>')
497
if (xmlDeclarationEnd == pEnd)
499
DeprecatedCString str(ptr, xmlDeclarationEnd - ptr); // No need for +1, because we have an extra "?" to lose at the end of XML declaration.
// NOTE(review): findXMLEncoding() takes an int& for the length, while the
// parameter `len` of this function is a size_t; presumably a local int named
// `len` is in scope at this point - confirm.
501
int pos = findXMLEncoding(str, len);
503
setEncoding(TextEncoding(str.mid(pos, len)), EncodingFromXMLHeader);
504
// continue looking for a charset - it may be specified in an HTTP-Equiv meta
// The next four branches match "<?x" (UTF-16) or "<?" (UTF-32) stored as wide
// characters, in little- and big-endian byte order.
505
} else if (ptr[0] == '<' && ptr[1] == 0 && ptr[2] == '?' && ptr[3] == 0 && ptr[4] == 'x' && ptr[5] == 0) {
506
setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
508
} else if (ptr[0] == 0 && ptr[1] == '<' && ptr[2] == 0 && ptr[3] == '?' && ptr[4] == 0 && ptr[5] == 'x') {
509
setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
511
} else if (ptr[0] == '<' && ptr[1] == 0 && ptr[2] == 0 && ptr[3] == 0 && ptr[4] == '?' && ptr[5] == 0 && ptr[6] == 0 && ptr[7] == 0) {
512
setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);
514
} else if (ptr[0] == 0 && ptr[1] == 0 && ptr[2] == 0 && ptr[3] == '<' && ptr[4] == 0 && ptr[5] == 0 && ptr[6] == 0 && ptr[7] == '?') {
515
setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);
519
// we still don't have an encoding, and are in the head
520
// the following tags are allowed in <head>:
521
// SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE
523
// We stop scanning when a tag that is not permitted in <head>
524
// is seen, rather than when </head> is seen, because that more closely
525
// matches behavior in other browsers; more details in
526
// <http://bugs.webkit.org/show_bug.cgi?id=3590>.
528
// Additionally, we ignore things that look like tags in <title>, <script> and <noscript>; see
529
// <http://bugs.webkit.org/show_bug.cgi?id=4560>, <http://bugs.webkit.org/show_bug.cgi?id=12165>
530
// and <http://bugs.webkit.org/show_bug.cgi?id=12389>.
532
AtomicStringImpl* enclosingTagName = 0;
534
while (ptr + 3 < pEnd) { // +3 guarantees that "<!--" fits in the buffer - and certainly we aren't going to lose any "charset" that way.
540
if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-') {
542
skipComment(ptr, pEnd);
546
// the HTTP-EQUIV meta has no effect on XHTML
547
if (m_contentType == XML)
555
// Grab the tag name, but mostly ignore namespaces.
556
bool sawNamespace = false;
// Lower-case letters and digits are accepted here; upper-case letters are
// handled by the branch below. '&&' binds tighter than '||', so the condition
// groups as (a-z) || (0-9).
569
if (c >= 'a' && c <= 'z' || c >= '0' && c <= '9')
571
else if (c >= 'A' && c <= 'Z')
575
tagBuffer[len++] = c;
579
AtomicString tag(tagBuffer);
// While inside <title>, <script> or <noscript>, only the matching end tag
// clears enclosingTagName.
581
if (enclosingTagName) {
582
if (end && tag.impl() == enclosingTagName)
583
enclosingTagName = 0;
586
enclosingTagName = titleTag.localName().impl();
587
else if (tag == scriptTag)
588
enclosingTagName = scriptTag.localName().impl();
589
else if (tag == noscriptTag)
590
enclosingTagName = noscriptTag.localName().impl();
593
// Find where the opening tag ends.
594
const char* tagContentStart = ptr;
596
while (ptr != pEnd && *ptr != '>') {
// A '>' inside a quoted attribute value must not end the tag.
597
if (*ptr == '\'' || *ptr == '"') {
598
char quoteMark = *ptr;
600
while (ptr != pEnd && *ptr != quoteMark)
612
if (!end && tag == metaTag && !sawNamespace) {
613
DeprecatedCString str(tagContentStart, ptr - tagContentStart);
616
while (pos < (int)str.length()) {
// Case-insensitive search (third argument false) for "charset".
617
if ((pos = str.find("charset", pos, false)) == -1)
621
while (pos < (int)str.length() && str[pos] <= ' ')
623
if (pos == (int)str.length())
625
if (str[pos++] != '=')
// NOTE(review): '&&' binds tighter than '||', so the length check below guards
// only the `str[pos] <= ' '` test; the '=', '"' and '\'' comparisons are
// evaluated even when pos == str.length(). The whole '||' group was probably
// meant to be parenthesized after the '&&'.
627
while (pos < (int)str.length() &&
628
(str[pos] <= ' ') || str[pos] == '=' || str[pos] == '"' || str[pos] == '\'')
632
if (pos == (int)str.length())
// The charset value runs up to the next space, quotation mark, ';' or '>'.
634
unsigned endpos = pos;
635
while (endpos < str.length() &&
636
str[endpos] != ' ' && str[endpos] != '"' && str[endpos] != '\'' &&
637
str[endpos] != ';' && str[endpos] != '>')
639
setEncoding(TextEncoding(str.mid(pos, endpos - pos)), EncodingFromMetaTag);
// setEncoding() keeps the old encoding when the name is not valid, so the
// source is re-checked to see whether the meta charset was actually applied.
640
if (m_source == EncodingFromMetaTag)
643
if (endpos >= str.length() || str[endpos] == '/' || str[endpos] == '>')
// Any other tag that is not allowed in <head> (and is not nested inside
// <title>/<script>/<noscript>) ends the search.
648
} else if (tag != scriptTag && tag != noscriptTag && tag != styleTag &&
649
tag != linkTag && tag != metaTag && tag != objectTag &&
650
tag != titleTag && tag != baseTag &&
651
(end || tag != htmlTag) && !enclosingTagName &&
652
(tag != headTag) && isalpha(tagBuffer[0])) {
653
m_checkedForHeadCharset = true;
663
// Runs KanjiCode::judge() over `data` and switches the decoder to
// "ISO-2022-JP", "EUC-JP" or "Shift_JIS" with source AutoDetectedEncoding.
// The SJIS case selects "Shift_JIS"; the case labels in front of the other two
// calls are presumably JIS and EUC - TODO confirm. The ASCII, UTF16 and UTF8
// results have no visible action.
// NOTE: `len` is a size_t while judge() takes an int length.
void TextResourceDecoder::detectJapaneseEncoding(const char* data, size_t len)
665
switch (KanjiCode::judge(data, len)) {
667
setEncoding("ISO-2022-JP", AutoDetectedEncoding);
670
setEncoding("EUC-JP", AutoDetectedEncoding);
672
case KanjiCode::SJIS:
673
setEncoding("Shift_JIS", AutoDetectedEncoding);
675
case KanjiCode::ASCII:
676
case KanjiCode::UTF16:
677
case KanjiCode::UTF8:
682
// Decodes the next chunk of raw bytes. Before decoding, the sniffing passes
// that have not run yet are given a chance, in this order: byte order mark,
// CSS @charset (CSS content only), head/XML-declaration charset (HTML and XML
// content only), and finally Japanese auto-detection when the current encoding
// is a Japanese one that was neither user-chosen nor already auto-detected.
// Bytes held back in m_buffer by the sniffing passes are decoded together with
// the new data; when nothing is buffered, `data` is decoded directly.
// NOTE(review): when a charset check returns false the function presumably
// returns early without decoding - confirm.
String TextResourceDecoder::decode(const char* data, size_t len)
684
if (!m_checkedForBOM)
685
checkForBOM(data, len);
687
// Set by the charset checks when they have already appended `data` to m_buffer.
bool movedDataToBuffer = false;
689
if (m_contentType == CSS && !m_checkedForCSSCharset)
690
if (!checkForCSSCharset(data, len, movedDataToBuffer))
693
if ((m_contentType == HTML || m_contentType == XML) && !m_checkedForHeadCharset) // HTML and XML
694
if (!checkForHeadCharset(data, len, movedDataToBuffer))
697
// Do the auto-detect if our default encoding is one of the Japanese ones.
698
// FIXME: It seems wrong to change our encoding downstream after we have already done some decoding.
699
if (m_source != UserChosenEncoding && m_source != AutoDetectedEncoding && encoding().isJapanese())
700
detectJapaneseEncoding(data, len);
702
ASSERT(encoding().isValid());
704
if (m_buffer.isEmpty())
705
return m_decoder.decode(data, len);
707
// Append `data` only if a charset check has not already done so.
if (!movedDataToBuffer) {
708
size_t oldSize = m_buffer.size();
709
m_buffer.resize(oldSize + len);
710
memcpy(m_buffer.data() + oldSize, data, len);
713
String result = m_decoder.decode(m_buffer.data(), m_buffer.size());
718
// Decodes whatever is still held in m_buffer, passing `true` as the third
// argument of the decoder's decode() call (presumably a "flush" flag that
// marks the end of the input - TODO confirm).
String TextResourceDecoder::flush()
720
String result = m_decoder.decode(m_buffer.data(), m_buffer.size(), true);