2
This file was taken from the KDE 4.x libraries and backported to Qt 3.
4
Copyright (C) 1999 Lars Knoll (knoll@kde.org)
5
Copyright (C) 2003 Dirk Mueller (mueller@kde.org)
6
Copyright (C) 2003 Apple Computer, Inc.
7
Copyright (C) 2007 Nick Shaforostoff (shafff@ukr.net)
9
This library is free software; you can redistribute it and/or
10
modify it under the terms of the GNU Library General Public
11
License as published by the Free Software Foundation; either
12
version 2 of the License, or (at your option) any later version.
14
This library is distributed in the hope that it will be useful,
15
but WITHOUT ANY WARRANTY; without even the implied warranty of
16
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17
Library General Public License for more details.
19
You should have received a copy of the GNU Library General Public License
20
along with this library; see the file COPYING.LIB. If not, write to
21
the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
22
Boston, MA 02110-1301, USA.
24
//----------------------------------------------------------------------------
26
// decoder for input stream
28
#include "encodingdetector.h"
31
//#define DECODE_DEBUG
33
#define MAX_BUFFER 16*1024
38
#include "encodingdetector_ja_p.h"
41
#include <qtextcodec.h>
44
#include <kcharsets.h>
50
// The following table was taken from libpango 1.19.3 and slightly modified.
51
// Multiple scripts per language were removed and the entries were reordered so
52
// that simple substring matching will work. For example, bam was put before ba
53
// so that the first match will be likely the right match. Otherwise "ba" would
54
// match "bam" but we would have to search on to find "bam" which is what we want.
55
// The original file is called pango-script-lang-table.h
57
/* pango-script-lang-table.h:
59
* Generated by gen-script-for-lang-new.c
61
* Source: fontconfig-2.4.91
63
* Do not edit. // I did. Sue me ;)
65
typedef struct _PangoScriptForLang {
67
EncodingDetector::AutoDetectScript scripts[1];
70
//Unfortunately EncodingDetector does not know all scripts that Pango knows.
71
//Also, using EncodingDetector::CentralEuropean for the appropriate countries
72
//might give better results in some cases.
73
//One especially important (many speakers/literates) omission is the lack of Indic scripts (Devanagari, Bengali, Tamil, ... are all mapped to None below).
76
#define PANGO_SCRIPT_ARMENIAN EncodingDetector::None
77
#define PANGO_SCRIPT_BENGALI EncodingDetector::None
78
#define PANGO_SCRIPT_CANADIAN_ABORIGINAL EncodingDetector::None
79
#define PANGO_SCRIPT_CHEROKEE EncodingDetector::None
80
#define PANGO_SCRIPT_DEVANAGARI EncodingDetector::None
81
#define PANGO_SCRIPT_ETHIOPIC EncodingDetector::None
82
#define PANGO_SCRIPT_GUJARATI EncodingDetector::None
83
#define PANGO_SCRIPT_GURMUKHI EncodingDetector::None
84
#define PANGO_SCRIPT_KANNADA EncodingDetector::None
85
#define PANGO_SCRIPT_KHMER EncodingDetector::None
86
#define PANGO_SCRIPT_LAO EncodingDetector::None
87
#define PANGO_SCRIPT_MALAYALAM EncodingDetector::None
88
#define PANGO_SCRIPT_MONGOLIAN EncodingDetector::None
89
#define PANGO_SCRIPT_MYANMAR EncodingDetector::None
90
#define PANGO_SCRIPT_ORIYA EncodingDetector::None
91
#define PANGO_SCRIPT_SINHALA EncodingDetector::None
92
#define PANGO_SCRIPT_SYRIAC EncodingDetector::None
93
#define PANGO_SCRIPT_TAGALOG EncodingDetector::None
94
#define PANGO_SCRIPT_TAMIL EncodingDetector::None
95
#define PANGO_SCRIPT_TIBETAN EncodingDetector::None
96
#define PANGO_SCRIPT_TELUGU EncodingDetector::None
98
//Instead of changing the table even more...
99
#define PANGO_SCRIPT_ARABIC EncodingDetector::Arabic
100
#define PANGO_SCRIPT_CYRILLIC EncodingDetector::Cyrillic
101
#define PANGO_SCRIPT_GEORGIAN EncodingDetector::SouthEasternEurope
102
#define PANGO_SCRIPT_GREEK EncodingDetector::Greek
103
#define PANGO_SCRIPT_HEBREW EncodingDetector::Hebrew
104
#define PANGO_SCRIPT_LATIN EncodingDetector::WesternEuropean
105
#define PANGO_SCRIPT_THAI EncodingDetector::Thai
108
static const PangoScriptForLang pango_script_for_lang[] = {
109
{ "aa", { PANGO_SCRIPT_LATIN/*62*/ } },
110
{ "ab", { PANGO_SCRIPT_CYRILLIC/*90*/ } },
111
{ "af", { PANGO_SCRIPT_LATIN/*69*/ } },
112
{ "am", { PANGO_SCRIPT_ETHIOPIC/*218*/ } },
113
{ "ar", { PANGO_SCRIPT_ARABIC/*125*/ } },
114
{ "as", { PANGO_SCRIPT_BENGALI/*89*/ } },
115
{ "ast", { PANGO_SCRIPT_LATIN/*66*/ } },
116
{ "ava", { PANGO_SCRIPT_CYRILLIC/*67*/ } },
117
{ "ay", { PANGO_SCRIPT_LATIN/*60*/ } },
118
{ "az-ir", { PANGO_SCRIPT_ARABIC/*129*/ } },
119
{ "az", { PANGO_SCRIPT_CYRILLIC/*80*/ } }, //, PANGO_SCRIPT_LATIN/*68*/ } },
120
{ "bam", { PANGO_SCRIPT_LATIN/*60*/ } },
121
{ "ba", { PANGO_SCRIPT_CYRILLIC/*82*/ } },
122
{ "be", { PANGO_SCRIPT_CYRILLIC/*68*/ } },
123
{ "bg", { PANGO_SCRIPT_CYRILLIC/*60*/ } },
124
{ "bh", { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
125
{ "bho", { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
126
{ "bi", { PANGO_SCRIPT_LATIN/*58*/ } },
127
{ "bin", { PANGO_SCRIPT_LATIN/*76*/ } },
128
{ "bn", { PANGO_SCRIPT_BENGALI/*89*/ } },
129
{ "bo", { PANGO_SCRIPT_TIBETAN/*95*/ } },
130
{ "br", { PANGO_SCRIPT_LATIN/*64*/ } },
131
{ "bs", { PANGO_SCRIPT_LATIN/*62*/ } },
132
{ "bua", { PANGO_SCRIPT_CYRILLIC/*70*/ } },
133
{ "ca", { PANGO_SCRIPT_LATIN/*74*/ } },
134
{ "ce", { PANGO_SCRIPT_CYRILLIC/*67*/ } },
135
{ "chm", { PANGO_SCRIPT_CYRILLIC/*76*/ } },
136
{ "chr", { PANGO_SCRIPT_CHEROKEE/*85*/ } },
137
{ "ch", { PANGO_SCRIPT_LATIN/*58*/ } },
138
{ "co", { PANGO_SCRIPT_LATIN/*84*/ } },
139
{ "cs", { PANGO_SCRIPT_LATIN/*82*/ } },
140
{ "cu", { PANGO_SCRIPT_CYRILLIC/*103*/ } },
141
{ "cv", { PANGO_SCRIPT_CYRILLIC/*72*/ } }, //, PANGO_SCRIPT_LATIN/*2*/ } },
142
{ "cy", { PANGO_SCRIPT_LATIN/*78*/ } },
143
{ "da", { PANGO_SCRIPT_LATIN/*70*/ } },
144
{ "de", { PANGO_SCRIPT_LATIN/*59*/ } },
145
{ "dz", { PANGO_SCRIPT_TIBETAN/*95*/ } },
146
{ "el", { PANGO_SCRIPT_GREEK/*69*/ } },
147
{ "en", { PANGO_SCRIPT_LATIN/*72*/ } },
148
{ "eo", { PANGO_SCRIPT_LATIN/*64*/ } },
149
{ "es", { PANGO_SCRIPT_LATIN/*66*/ } },
150
// { "et", { PANGO_SCRIPT_LATIN/*64*/ } },
151
{ "et", { EncodingDetector::Baltic } },
152
{ "eu", { PANGO_SCRIPT_LATIN/*56*/ } },
153
{ "fa", { PANGO_SCRIPT_ARABIC/*129*/ } },
154
{ "fi", { PANGO_SCRIPT_LATIN/*62*/ } },
155
{ "fj", { PANGO_SCRIPT_LATIN/*52*/ } },
156
{ "fo", { PANGO_SCRIPT_LATIN/*68*/ } },
157
{ "fr", { PANGO_SCRIPT_LATIN/*84*/ } },
158
{ "ful", { PANGO_SCRIPT_LATIN/*62*/ } },
159
{ "fur", { PANGO_SCRIPT_LATIN/*66*/ } },
160
{ "fy", { PANGO_SCRIPT_LATIN/*75*/ } },
161
{ "ga", { PANGO_SCRIPT_LATIN/*80*/ } },
162
{ "gd", { PANGO_SCRIPT_LATIN/*70*/ } },
163
{ "gez", { PANGO_SCRIPT_ETHIOPIC/*218*/ } },
164
{ "gl", { PANGO_SCRIPT_LATIN/*66*/ } },
165
{ "gn", { PANGO_SCRIPT_LATIN/*70*/ } },
166
{ "gu", { PANGO_SCRIPT_GUJARATI/*78*/ } },
167
{ "gv", { PANGO_SCRIPT_LATIN/*54*/ } },
168
{ "ha", { PANGO_SCRIPT_LATIN/*60*/ } },
169
{ "haw", { PANGO_SCRIPT_LATIN/*62*/ } },
170
{ "he", { PANGO_SCRIPT_HEBREW/*27*/ } },
171
{ "hi", { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
172
{ "ho", { PANGO_SCRIPT_LATIN/*52*/ } },
173
{ "hr", { PANGO_SCRIPT_LATIN/*62*/ } },
174
{ "hu", { PANGO_SCRIPT_LATIN/*70*/ } },
175
{ "hy", { PANGO_SCRIPT_ARMENIAN/*77*/ } },
176
{ "ia", { PANGO_SCRIPT_LATIN/*52*/ } },
177
{ "ibo", { PANGO_SCRIPT_LATIN/*58*/ } },
178
{ "id", { PANGO_SCRIPT_LATIN/*54*/ } },
179
{ "ie", { PANGO_SCRIPT_LATIN/*52*/ } },
180
{ "ik", { PANGO_SCRIPT_CYRILLIC/*68*/ } },
181
{ "io", { PANGO_SCRIPT_LATIN/*52*/ } },
182
{ "is", { PANGO_SCRIPT_LATIN/*70*/ } },
183
{ "it", { PANGO_SCRIPT_LATIN/*72*/ } },
184
{ "iu", { PANGO_SCRIPT_CANADIAN_ABORIGINAL/*161*/ } },
185
// { "ja", { PANGO_SCRIPT_HAN/*6356*/, PANGO_SCRIPT_KATAKANA/*88*/, PANGO_SCRIPT_HIRAGANA/*85*/ } },
186
{ "ja", { EncodingDetector::Japanese } },
187
{ "kaa", { PANGO_SCRIPT_CYRILLIC/*78*/ } },
188
{ "ka", { PANGO_SCRIPT_GEORGIAN/*33*/ } },
189
{ "ki", { PANGO_SCRIPT_LATIN/*56*/ } },
190
{ "kk", { PANGO_SCRIPT_CYRILLIC/*77*/ } },
191
{ "kl", { PANGO_SCRIPT_LATIN/*81*/ } },
192
{ "km", { PANGO_SCRIPT_KHMER/*70*/ } },
193
{ "kn", { PANGO_SCRIPT_KANNADA/*80*/ } },
194
// { "ko", { PANGO_SCRIPT_HANGUL/*2443*/ } },
195
{ "ko", { EncodingDetector::Korean } },
196
{ "kok", { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
197
{ "ks", { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
198
{ "ku-ir", { PANGO_SCRIPT_ARABIC/*32*/ } },
199
{ "ku", { PANGO_SCRIPT_CYRILLIC/*60*/ } }, //, PANGO_SCRIPT_LATIN/*4*/ } },
200
{ "kum", { PANGO_SCRIPT_CYRILLIC/*66*/ } },
201
{ "kv", { PANGO_SCRIPT_CYRILLIC/*70*/ } },
202
{ "kw", { PANGO_SCRIPT_LATIN/*64*/ } },
203
{ "ky", { PANGO_SCRIPT_CYRILLIC/*70*/ } },
204
{ "la", { PANGO_SCRIPT_LATIN/*68*/ } },
205
{ "lb", { PANGO_SCRIPT_LATIN/*75*/ } },
206
{ "lez", { PANGO_SCRIPT_CYRILLIC/*67*/ } },
207
{ "ln", { PANGO_SCRIPT_LATIN/*78*/ } },
208
{ "lo", { PANGO_SCRIPT_LAO/*65*/ } },
209
// { "lt", { PANGO_SCRIPT_LATIN/*70*/ } },
210
{ "lt", { EncodingDetector::Baltic } },
211
// { "lv", { PANGO_SCRIPT_LATIN/*78*/ } },
212
{ "lv", { EncodingDetector::Baltic } },
213
{ "mg", { PANGO_SCRIPT_LATIN/*56*/ } },
214
{ "mh", { PANGO_SCRIPT_LATIN/*62*/ } },
215
{ "mi", { PANGO_SCRIPT_LATIN/*64*/ } },
216
{ "mk", { PANGO_SCRIPT_CYRILLIC/*42*/ } },
217
{ "ml", { PANGO_SCRIPT_MALAYALAM/*78*/ } },
218
{ "mn", { PANGO_SCRIPT_MONGOLIAN/*130*/ } },
219
{ "mo", { PANGO_SCRIPT_CYRILLIC/*66*/ } }, //, PANGO_SCRIPT_LATIN/*62*/ } },
220
{ "mr", { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
221
{ "mt", { PANGO_SCRIPT_LATIN/*72*/ } },
222
{ "my", { PANGO_SCRIPT_MYANMAR/*48*/ } },
223
{ "nb", { PANGO_SCRIPT_LATIN/*70*/ } },
224
{ "nds", { PANGO_SCRIPT_LATIN/*59*/ } },
225
{ "ne", { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
226
{ "nl", { PANGO_SCRIPT_LATIN/*82*/ } },
227
{ "nn", { PANGO_SCRIPT_LATIN/*76*/ } },
228
{ "no", { PANGO_SCRIPT_LATIN/*70*/ } },
229
{ "nr", { PANGO_SCRIPT_LATIN/*52*/ } },
230
{ "nso", { PANGO_SCRIPT_LATIN/*58*/ } },
231
{ "ny", { PANGO_SCRIPT_LATIN/*54*/ } },
232
{ "oc", { PANGO_SCRIPT_LATIN/*70*/ } },
233
{ "om", { PANGO_SCRIPT_LATIN/*52*/ } },
234
{ "or", { PANGO_SCRIPT_ORIYA/*79*/ } },
235
{ "os", { PANGO_SCRIPT_CYRILLIC/*66*/ } },
236
{ "pa", { PANGO_SCRIPT_GURMUKHI/*63*/ } },
237
{ "pl", { PANGO_SCRIPT_LATIN/*70*/ } },
238
{ "ps-af", { PANGO_SCRIPT_ARABIC/*49*/ } },
239
{ "ps-pk", { PANGO_SCRIPT_ARABIC/*49*/ } },
240
{ "pt", { PANGO_SCRIPT_LATIN/*82*/ } },
241
{ "rm", { PANGO_SCRIPT_LATIN/*66*/ } },
242
{ "ro", { PANGO_SCRIPT_LATIN/*62*/ } },
243
{ "ru", { PANGO_SCRIPT_CYRILLIC/*66*/ } },
244
{ "sah", { PANGO_SCRIPT_CYRILLIC/*76*/ } },
245
{ "sa", { PANGO_SCRIPT_DEVANAGARI/*68*/ } },
246
{ "sco", { PANGO_SCRIPT_LATIN/*56*/ } },
247
{ "sel", { PANGO_SCRIPT_CYRILLIC/*66*/ } },
248
{ "se", { PANGO_SCRIPT_LATIN/*66*/ } },
249
{ "sh", { PANGO_SCRIPT_CYRILLIC/*76*/ } },
250
{ "si", { PANGO_SCRIPT_SINHALA/*77*/ } },
251
{ "sk", { PANGO_SCRIPT_LATIN/*86*/ } },
252
{ "sl", { PANGO_SCRIPT_LATIN/*62*/ } },
253
{ "sma", { PANGO_SCRIPT_LATIN/*60*/ } },
254
{ "smj", { PANGO_SCRIPT_LATIN/*60*/ } },
255
{ "smn", { PANGO_SCRIPT_LATIN/*68*/ } },
256
{ "sms", { PANGO_SCRIPT_LATIN/*80*/ } },
257
{ "sm", { PANGO_SCRIPT_LATIN/*52*/ } },
258
{ "so", { PANGO_SCRIPT_LATIN/*52*/ } },
259
{ "sq", { PANGO_SCRIPT_LATIN/*56*/ } },
260
{ "sr", { PANGO_SCRIPT_CYRILLIC/*76*/ } },
261
{ "ss", { PANGO_SCRIPT_LATIN/*52*/ } },
262
{ "st", { PANGO_SCRIPT_LATIN/*52*/ } },
263
{ "sv", { PANGO_SCRIPT_LATIN/*68*/ } },
264
{ "sw", { PANGO_SCRIPT_LATIN/*52*/ } },
265
{ "syr", { PANGO_SCRIPT_SYRIAC/*45*/ } },
266
{ "ta", { PANGO_SCRIPT_TAMIL/*48*/ } },
267
{ "te", { PANGO_SCRIPT_TELUGU/*80*/ } },
268
{ "tg", { PANGO_SCRIPT_CYRILLIC/*78*/ } },
269
{ "th", { PANGO_SCRIPT_THAI/*86*/ } },
270
{ "ti-er", { PANGO_SCRIPT_ETHIOPIC/*255*/ } },
271
{ "ti-et", { PANGO_SCRIPT_ETHIOPIC/*255*/ } },
272
{ "tig", { PANGO_SCRIPT_ETHIOPIC/*221*/ } },
273
{ "tk", { PANGO_SCRIPT_CYRILLIC/*74*/ } },
274
{ "tl", { PANGO_SCRIPT_TAGALOG/*19*/ } },
275
{ "tn", { PANGO_SCRIPT_LATIN/*58*/ } },
276
{ "to", { PANGO_SCRIPT_LATIN/*52*/ } },
277
// { "tr", { PANGO_SCRIPT_LATIN/*70*/ } },
278
{ "tr", { EncodingDetector::Turkish } },
279
{ "ts", { PANGO_SCRIPT_LATIN/*52*/ } },
280
{ "tt", { PANGO_SCRIPT_CYRILLIC/*76*/ } },
281
{ "tw", { PANGO_SCRIPT_LATIN/*70*/ } },
282
{ "tyv", { PANGO_SCRIPT_CYRILLIC/*70*/ } },
283
{ "ug", { PANGO_SCRIPT_ARABIC/*125*/ } },
284
{ "uk", { PANGO_SCRIPT_CYRILLIC/*72*/ } },
285
{ "ur", { PANGO_SCRIPT_ARABIC/*145*/ } },
286
{ "uz", { PANGO_SCRIPT_CYRILLIC/*68*/ } },
287
{ "ven", { PANGO_SCRIPT_LATIN/*62*/ } },
288
{ "vi", { PANGO_SCRIPT_LATIN/*186*/ } },
289
{ "vot", { PANGO_SCRIPT_LATIN/*62*/ } },
290
{ "vo", { PANGO_SCRIPT_LATIN/*54*/ } },
291
{ "wa", { PANGO_SCRIPT_LATIN/*70*/ } },
292
{ "wen", { PANGO_SCRIPT_LATIN/*76*/ } },
293
{ "wo", { PANGO_SCRIPT_LATIN/*66*/ } },
294
{ "xh", { PANGO_SCRIPT_LATIN/*52*/ } },
295
{ "yap", { PANGO_SCRIPT_LATIN/*58*/ } },
296
{ "yi", { PANGO_SCRIPT_HEBREW/*27*/ } },
297
{ "yo", { PANGO_SCRIPT_LATIN/*114*/ } },
298
// { "zh-cn", { PANGO_SCRIPT_HAN/*6763*/ } },
299
{ "zh-cn", { EncodingDetector::ChineseSimplified } },
300
// { "zh-hk", { PANGO_SCRIPT_HAN/*2213*/ } },
301
{ "zh-hk", { EncodingDetector::ChineseTraditional } },
302
// { "zh-mo", { PANGO_SCRIPT_HAN/*2213*/ } },
303
{ "zh-mo", { EncodingDetector::ChineseTraditional } },
304
// { "zh-sg", { PANGO_SCRIPT_HAN/*6763*/ } },
305
{ "zh-sg", { EncodingDetector::ChineseSimplified } },
306
// { "zh-tw", { PANGO_SCRIPT_HAN/*13063*/ } },
307
{ "zh-tw", { EncodingDetector::ChineseTraditional } },
308
{ "zu", { PANGO_SCRIPT_LATIN/*52*/ } },
309
{ "\x00", { EncodingDetector::None } } //end mark
323
static bool is16Bit(QTextCodec* codec)
325
switch (codec->mibEnum())
337
class EncodingDetectorPrivate
341
QTextDecoder *m_decoder; // utf16
342
QTextCodec *m_defaultCodec;
343
QCString m_storeDecoderName;
345
EncodingDetector::EncodingChoiceSource m_source;
346
EncodingDetector::AutoDetectScript m_autoDetectLanguage;
348
bool m_visualRTL : 1;
350
bool m_writtingHappened : 1;
351
bool m_analyzeCalled : 1; //for decode()
354
QCString m_bufferForDefferedEncDetection;
356
EncodingDetectorPrivate()
357
: m_codec(QTextCodec::codecForMib(MibLatin1))
358
, m_decoder(m_codec->makeDecoder())
359
, m_defaultCodec(m_codec)
360
, m_source(EncodingDetector::DefaultEncoding)
361
, m_autoDetectLanguage(EncodingDetector::SemiautomaticDetection)
364
, m_writtingHappened(false)
365
, m_analyzeCalled(false)
370
EncodingDetectorPrivate(QTextCodec* codec,EncodingDetector::EncodingChoiceSource source, EncodingDetector::AutoDetectScript script)
372
, m_decoder(m_codec->makeDecoder())
373
, m_defaultCodec(m_codec)
375
, m_autoDetectLanguage(script)
378
, m_writtingHappened(false)
379
, m_analyzeCalled(false)
384
~EncodingDetectorPrivate()
391
static QCString automaticDetectionForArabic( const unsigned char* ptr, int size )
393
for ( int i = 0; i < size; ++i ) {
394
if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) || ptr[ i ] == 0xA1 || ptr[ i ] == 0xA2 || ptr[ i ] == 0xA3
395
|| ( ptr[ i ] >= 0xA5 && ptr[ i ] <= 0xAB ) || ( ptr[ i ] >= 0xAE && ptr[ i ] <= 0xBA )
396
|| ptr[ i ] == 0xBC || ptr[ i ] == 0xBD || ptr[ i ] == 0xBE || ptr[ i ] == 0xC0
397
|| ( ptr[ i ] >= 0xDB && ptr[ i ] <= 0xDF ) || ( ptr[ i ] >= 0xF3 ) ) {
405
static QCString automaticDetectionForBaltic( const unsigned char* ptr, int size )
407
for ( int i = 0; i < size; ++i ) {
408
if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9E ) )
411
if ( ptr[ i ] == 0xA1 || ptr[ i ] == 0xA5 )
412
return "iso-8859-13";
415
return "iso-8859-13";
418
static QCString automaticDetectionForCentralEuropean(const unsigned char* ptr, int size )
421
for ( int i = 0; i < size; ++i ) {
422
if ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) {
423
if ( ptr[ i ] == 0x81 || ptr[ i ] == 0x83 || ptr[ i ] == 0x90 || ptr[ i ] == 0x98 )
428
else { // maybe ibm852 ?
433
if ( ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE || ptr[ i ] == 0xBE || ptr[ i ] == 0xC3 || ptr[ i ] == 0xD0 || ptr[ i ] == 0xE3 || ptr[ i ] == 0xF0 ) {
436
else { // maybe ibm852 ?
437
if ( charset.isNull() )
438
charset = "iso-8859-2";
444
if ( charset.isNull() )
445
charset = "iso-8859-3";
447
return charset.data();
450
// Heuristic charset guess for Cyrillic text.
// ptr/size: raw bytes to inspect.
// The visible code counts bytes falling into ranges characteristic of cp1251,
// KOI8 and IBM866, tallies a few frequent letters / letter pairs per encoding,
// and then compares the tallies to pick a winner.
static QCString automaticDetectionForCyrillic( const unsigned char* ptr, int size)
453
kWarning() << "EncodingDetector: Cyr heuristics";
456
// if (ptr[0]==0xef && ptr[1]==0xbb && ptr[2]==0xbf)
470
int cp1251_o_capital=0;
475
int cp1251_a_capital=0;
480
int cp1251_s_capital=0;
485
int cp1251_i_capital=0;
488
int cp1251_small_range=0;
489
int koi_small_range=0;
490
int ibm866_small_range=0;
493
// The scan starts at index 1, which keeps the ptr[i-1] look-behind used for the
// "st" pair tests in bounds. It stops early once 1000 bytes have been counted
// in the cp1251 + KOI ranges.
for (i=1; (i<size) && (cp1251_small_range+koi_small_range<1000) ;++i)
497
++cp1251_small_range;
499
// Per-letter tallies for the cp1251 interpretation of the byte.
if (ptr[i]==0xee)//small o
501
else if (ptr[i]==0xe0)//small a
503
else if (ptr[i]==0xe8)//small i
505
else if (ptr[i]==0xf1)//small s
507
else if (ptr[i]==0xf2 && ptr[i-1]==0xf1)//small st
510
else if (ptr[i]==0xef)
512
else if (ptr[i]==0xe1)
514
else if (ptr[i]==0xe9)
516
else if (ptr[i]==0xf3)
520
else if (ptr[i]>0xbf)
524
// NOTE(review): the "small o" label on the next line looks copy-pasted. 0xd0/0xd1
// are not tested as a KOI letter elsewhere in this chain; presumably this is the
// UTF-8 lead-byte check feeding utf8_mark (used further down) -- confirm.
if (ptr[i]==0xd0||ptr[i]==0xd1)//small o
526
else if (ptr[i]==0xcf)//small o
528
else if (ptr[i]==0xc1)//small a
530
else if (ptr[i]==0xc9)//small i
532
else if (ptr[i]==0xd3)//small s
534
else if (ptr[i]==0xd4 && ptr[i-1]==0xd3)//small st
537
else if (ptr[i]==0xce)
539
else if (ptr[i]==0xc0)
541
else if (ptr[i]==0xc8)
543
// NOTE(review): if these branches form a single if/else-if chain, as the visible
// lines suggest, this branch can never be taken: 0xd1 is already matched by the
// first test of the chain. Confirm against the full source.
else if (ptr[i]==0xd1)
546
else if (ptr[i]>0x9f && ptr[i]<0xb0) //the first 16 letters make up 60%
547
++ibm866_small_range;
552
// Fewer than 8 bytes in any of the counted ranges: too little evidence.
if (cp1251_small_range+koi_small_range+ibm866_small_range<8)
557
// UTF-8 is preferred when three times the UTF-8 marker count exceeds the
// combined single-byte range counts.
if (3*utf8_mark>cp1251_small_range+koi_small_range+ibm866_small_range)
560
kWarning() << "Cyr Enc Detection: UTF8";
565
if (ibm866_small_range>cp1251_small_range+koi_small_range)
568
// QCString koi_string = "koi8-u";
569
// QCString cp1251_string = "cp1251";
571
// Scoring: the "st" pair counts are compared first, then the single-letter
// tallies (a, o, i, s), then the capital-letter tallies.
if (cp1251_st==0 && koi_st>1)
573
else if (koi_st==0 && cp1251_st>1)
576
if (cp1251_st && koi_st)
578
// NOTE(review): if cp1251_st / koi_st are ints like the other counters declared
// above (their declarations are not visible here), these are integer divisions,
// so ">2" only holds from a ratio of 3 upwards.
if (cp1251_st/koi_st>2)
580
else if (koi_st/cp1251_st>2)
586
else if (cp1251_a || koi_a)
591
else if (cp1251_o || koi_o)
596
else if (cp1251_i || koi_i)
601
else if (cp1251_s || koi_s)
604
if (cp1251_a_capital>koi_a_capital)
606
else if (cp1251_a_capital || koi_a_capital)
609
if (cp1251_o_capital>koi_o_capital)
611
else if (cp1251_o_capital || koi_o_capital)
614
if (cp1251_i_capital>koi_i_capital)
616
else if (cp1251_i_capital || koi_i_capital)
619
if (cp1251_s_capital>koi_s_capital)
621
else if (cp1251_s_capital || koi_s_capital)
624
kWarning()<<"koi_score " << koi_score << " cp1251_score " << cp1251_score;
626
// When the two scores are within 10 of each other they are replaced by the raw
// range counts before the final comparison.
if (abs(koi_score-cp1251_score)<10)
629
cp1251_score=cp1251_small_range;
630
koi_score=koi_small_range;
632
if (cp1251_score>koi_score)
638
// if (cp1251_score>koi_score)
639
// setEncoding("cp1251",AutoDetectedEncoding);
641
// setEncoding("koi8-u",AutoDetectedEncoding);
646
static QCString automaticDetectionForGreek( const unsigned char* ptr, int size )
648
for ( int i = 0; i < size; ++i ) {
649
if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x87 ) || ptr[ i ] == 0x89 || ptr[ i ] == 0x8B
650
|| ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x97 ) || ptr[ i ] == 0x99 || ptr[ i ] == 0x9B || ptr[ i ] == 0xA4
651
|| ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE ) {
659
static QCString automaticDetectionForHebrew( const unsigned char* ptr, int size )
661
for ( int i = 0; i < size; ++i ) {
662
if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x89 ) || ptr[ i ] == 0x8B
663
|| ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x99 ) || ptr[ i ] == 0x9B || ptr[ i ] == 0xA1 || ( ptr[ i ] >= 0xBF && ptr[ i ] <= 0xC9 )
664
|| ( ptr[ i ] >= 0xCB && ptr[ i ] <= 0xD8 ) ) {
668
if ( ptr[ i ] == 0xDF )
669
return "iso-8859-8-i";
672
return "iso-8859-8-i";
675
static QCString automaticDetectionForJapanese( const unsigned char* ptr, int size )
679
switch ( kc.guess_jp( (const char*)ptr, size ) ) {
680
case JapaneseCode::JIS:
682
case JapaneseCode::EUC:
684
case JapaneseCode::SJIS:
686
case JapaneseCode::UTF8:
695
static QCString automaticDetectionForTurkish( const unsigned char* ptr, int size )
697
for ( int i = 0; i < size; ++i ) {
698
if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x8C ) || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x9C ) || ptr[ i ] == 0x9F ) {
706
// Heuristic charset guess for Western European text.
// ptr/size: raw bytes to inspect; the buffer is scanned byte by byte.
// "iso-8859-15" is one of the possible results (other return statements are
// not visible in this fragment).
static QCString automaticDetectionForWesternEuropean( const unsigned char* ptr, int size )
708
uint nonansi_count=0;
709
for (int i=0; i<size; ++i)
714
// A byte in 0xC2..0xEF followed by a byte in 0x80..0xBF has the shape of a
// UTF-8 lead byte plus continuation byte; i+1<size guards the look-ahead.
if ( ptr[i]>0xc1 && ptr[i]<0xf0 && i+1<size && ptr[i+1]>0x7f && ptr[i+1]<0xc0)
718
// NOTE(review): this range is empty -- no byte can be both >= 0x78 and <= 0x9,
// so the condition is always false and whatever it guards is dead code.
// The upper bound looks truncated; 0x9F (the 0x80..0x9F block that the sibling
// detectors test for Windows code pages) is the likely intent. Confirm and fix.
if (ptr[i] >= 0x78 && ptr[i] <= 0x9 )
727
return "iso-8859-15";
732
// Other browsers allow comments in the head section, so we need to also.
733
// It's important not to look for tags inside the comments.
734
static void skipComment(const char *&ptr, const char *pEnd)
737
// Allow <!-->; other browsers do.
748
// This is the real end of comment, "-->".
749
if (p[1]=='-' && p[2]=='>')
754
// This is the incorrect end of comment that other browsers allow, "--!>".
755
if (p[1] == '-' && p[2] == '!' && p[3] == '>')
767
// Locates the value of the "encoding" pseudo-attribute inside str.
// Returns the position of the encoding string; encodingLength receives its
// length (computed as end-pos). The caller in analyze() treats a return value
// of -1 as "not found".
768
static int findXMLEncoding(const QCString &str, int &encodingLength)
770
int len = str.length();
771
int pos = str.find("encoding");
776
// Skip spaces and stray control characters.
// NOTE(review): if str[pos] yields a plain (possibly signed) char, bytes >= 0x80
// also compare <= ' ' on signed-char platforms and would be skipped -- confirm.
777
while (pos<len && str[pos]<=' ')
780
// Bail out if nothing follows, or if the next character is not '='.
782
if (pos>=len || str[pos] != '=')
786
// Skip spaces and stray control characters.
787
while (pos<len && str[pos]<=' ')
790
// Bail out if nothing follows.
794
// The value must be quoted with either " or '.
795
char quoteMark = str[pos];
796
if (quoteMark != '"' && quoteMark != '\'')
800
// Find the matching trailing quotation mark.
802
while (end<len && str[end]!=quoteMark)
808
encodingLength = end-pos;
813
// Checks data[0..length) for byte sequences that are not well-formed UTF-8.
// Only meaningful while the current codec is UTF-8: for any other codec the
// function returns false ("no errors") without looking at the data.
// State is kept in d->m_multiByte, so a multi-byte sequence may presumably
// span two consecutive calls -- confirm against the full source.
bool EncodingDetector::errorsIfUtf8 (const char* data, int length)
815
if (d->m_codec->mibEnum()!=MibUtf8)
816
return false; //means no errors
817
// #define highest1Bits (unsigned char)0x80
818
// #define highest2Bits (unsigned char)0xC0
819
// #define highest3Bits (unsigned char)0xE0
820
// #define highest4Bits (unsigned char)0xF0
821
// #define highest5Bits (unsigned char)0xF8
// Masks selecting the top 1..5 bits of a byte; used below to classify it.
822
static const unsigned char highest1Bits = 0x80;
823
static const unsigned char highest2Bits = 0xC0;
824
static const unsigned char highest3Bits = 0xE0;
825
static const unsigned char highest4Bits = 0xF0;
826
static const unsigned char highest5Bits = 0xF8;
828
for (int i=0; i<length; ++i)
830
unsigned char c = data[i];
// While continuation bytes are still expected, each byte must look like 10xxxxxx.
832
if (d->m_multiByte>0)
834
if ((c & highest2Bits) == 0x80)
// Logged for a bad continuation byte (message text without underscore).
840
kWarning() << "EncDetector: Broken UTF8";
845
// most significant bit zero, single char
846
if ((c & highest1Bits) == 0x00)
849
// 110xxxxx => init 1 following bytes
850
if ((c & highest3Bits) == 0xC0)
856
// 1110xxxx => init 2 following bytes
857
if ((c & highest4Bits) == 0xE0)
863
// 11110xxx => init 3 following bytes
864
if ((c & highest5Bits) == 0xF0)
// Logged when the byte matches none of the lead-byte patterns above; the
// underscore distinguishes this message from the continuation-byte one.
870
kWarning() << "EncDetector:_Broken UTF8";
877
EncodingDetector::EncodingDetector() : d(new EncodingDetectorPrivate)
881
EncodingDetector::EncodingDetector(QTextCodec* codec, EncodingChoiceSource source, AutoDetectScript script) :
882
d(new EncodingDetectorPrivate(codec,source,script))
886
EncodingDetector::~EncodingDetector()
891
void EncodingDetector::setAutoDetectLanguage( EncodingDetector::AutoDetectScript lang)
893
d->m_autoDetectLanguage=lang;
895
EncodingDetector::AutoDetectScript EncodingDetector::autoDetectLanguage() const
897
return d->m_autoDetectLanguage;
900
EncodingDetector::EncodingChoiceSource EncodingDetector::encodingChoiceSource() const
905
const char* EncodingDetector::encoding() const
907
d->m_storeDecoderName = d->m_codec->name();
908
return d->m_storeDecoderName.data();
911
bool EncodingDetector::visuallyOrdered() const
913
return d->m_visualRTL;
916
// const QTextCodec* EncodingDetector::codec() const
918
// return d->m_codec;
921
QTextDecoder* EncodingDetector::decoder()
926
bool EncodingDetector::setEncoding(const char *_encoding, EncodingChoiceSource type)
929
QCString enc(_encoding);
930
if(/*enc.isNull() || */enc.isEmpty())
932
if (type==DefaultEncoding)
933
codec=d->m_defaultCodec;
939
//QString->QTextCodec
942
// hebrew visually ordered
946
codec = KGlobal::charsets()->codecForName(enc, b);
951
if (d->m_codec->mibEnum()==codec->mibEnum())
954
if ((type==EncodingFromMetaTag || type==EncodingFromXMLHeader) && is16Bit(codec))
956
//Sometimes the codec specified is absurd, i.e. UTF-16 despite
957
//us decoding a meta tag as ASCII. In that case, ignore it.
961
if (codec->mibEnum() == Mib8859_8)
963
//We do NOT want to use Qt's QHebrewCodec, since it tries to reorder itself.
964
codec = QTextCodec::codecForName("iso8859-8-i");
966
// visually ordered unless one of the following
967
if(!(enc=="iso-8859-8-i"||enc=="iso_8859-8-i"||enc=="csiso88598i"||enc=="logical"))
968
d->m_visualRTL = true;
974
d->m_decoder = d->m_codec->makeDecoder();
976
kDebug(6005) << "EncodingDetector::encoding used is" << d->m_codec->name();
981
bool EncodingDetector::analyze(const QByteArray &data)
983
return analyze( data.data(), data.size() );
986
// Inspects the first bytes of a document and selects a codec for it.
// data/len: raw input buffer and its length in bytes.
// Visible stages: (1) BOM / NUL-pattern sniffing for UTF-16 and UTF-8,
// (2) a UTF-8 well-formedness check via errorsIfUtf8(), (3) disabled (#if 0)
// HTML/XML header parsing, (4) per-script heuristics chosen by
// d->m_autoDetectLanguage.
bool EncodingDetector::analyze(const char *data, int len)
988
// Check for UTF-16 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.
989
// maximumBOMLength = 10
990
// Even if the user has chosen utf16 we still need to auto-detect the endianness
991
// At least 10 bytes are required because the NUL-pattern test below reads c1..c10.
if (len >= 10 && ((d->m_source != UserChosenEncoding) || is16Bit(d->m_codec)))
993
// Extract the first three bytes.
994
const uchar *udata = (const uchar *)data;
1000
const char *autoDetectedEncoding;
1001
// FE FF / FF FE: UTF-16 byte order mark in either byte order.
if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE))
1003
autoDetectedEncoding = "ISO-10646-UCS-2";
1005
// EF BB BF: UTF-8 byte order mark.
else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF)
1007
autoDetectedEncoding = "UTF-8";
1009
// No BOM, but a zero byte among the first two: look at the next seven bytes too.
else if (c1 == 0x00 || c2 == 0x00)
1011
uchar c4 = *udata++;
1012
uchar c5 = *udata++;
1013
uchar c6 = *udata++;
1014
uchar c7 = *udata++;
1015
uchar c8 = *udata++;
1016
uchar c9 = *udata++;
1017
uchar c10 = *udata++;
1019
// NOTE(review): despite their names these count the NON-zero bytes at even / odd
// positions ((cN != 0) is 1 for a non-zero byte). The test below is symmetric
// (all zero on one side, all non-zero on the other), so the result is unaffected.
int nul_count_even = (c2 != 0) + (c4 != 0) + (c6 != 0) + (c8 != 0) + (c10 != 0);
1020
int nul_count_odd = (c1 != 0) + (c3 != 0) + (c5 != 0) + (c7 != 0) + (c9 != 0);
1021
if ((nul_count_even==0 && nul_count_odd==5) || (nul_count_even==5 && nul_count_odd==0))
1022
autoDetectedEncoding = "ISO-10646-UCS-2";
1024
autoDetectedEncoding = 0;
1028
autoDetectedEncoding = 0;
1031
// If we found a BOM, use the encoding it implies.
1032
if (autoDetectedEncoding != 0)
1035
// NOTE(review): the codecForName() result is used below without a visible null check.
d->m_codec = QTextCodec::codecForName(autoDetectedEncoding);
1037
//enc = d->m_codec->name();
1038
// Replace the decoder so that it matches the newly selected codec.
delete d->m_decoder;
1039
d->m_decoder = d->m_codec->makeDecoder();
1041
kWarning() << "Detection by BOM";
1043
if (is16Bit(d->m_codec) && c2==0x00)
1045
// utf16LE, we need to put the decoder in LE mode
// This is done by feeding the decoder a little-endian BOM (FF FE) by hand.
1046
char reverseUtf16[3] = {(char)0xFF, (char)0xFE, 0x00};
1047
d->m_decoder->toUnicode(reverseUtf16, 2);
1053
//exit from routine in case it was called to only detect byte order for utf-16
1054
if (d->m_source==UserChosenEncoding)
1057
kWarning() << "EncodingDetector: UserChosenEncoding exit ";
1060
// Data that is not valid UTF-8 under a UTF-8 codec: fall back to the default codec.
if (errorsIfUtf8(data, len))
1061
setEncoding("",DefaultEncoding);
1064
#if 0 //This is for plaintext, so don't try to parse HTML headers -- ahartmetz
1067
// we still don't have an encoding, and are in the head
1068
// the following tags are allowed in <head>:
1069
// SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE
1070
const char *ptr = data;
1071
const char *pEnd = data+len;
1082
if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-')
1085
skipComment(ptr, pEnd);
1089
// Handle XML header, which can have encoding in it.
1090
if (ptr[0]=='?' && ptr[1]=='x' && ptr[2]=='m' && ptr[3]=='l')
1092
const char *end = ptr;
// NOTE(review): *end is dereferenced before the end < pEnd bounds check; if this
// disabled code is ever re-enabled the two operands should be swapped.
1093
while (*end != '>' && end < pEnd)
1095
if (*end == '\0' || end == pEnd)
1097
QCString str(ptr, end - ptr + 1);
1099
int pos = findXMLEncoding(str, length);
1100
// also handles the case when the specified encoding isn't correct
1101
if (pos!=-1 && setEncoding(str.mid(pos, length), EncodingFromXMLHeader))
1107
//look for <meta>, stop if we reach <body>
1109
!((*ptr >= 'a') && (*ptr <= 'z') ||
1110
(*ptr >= 'A') && (*ptr <= 'Z'))
1117
const char* max=ptr+4;
1121
((*ptr >= 'a') && (*ptr <= 'z') ||
1122
(*ptr >= 'A') && (*ptr <= 'Z') ||
1123
(*ptr >= '0') && (*ptr <= '9'))
1127
tmp[length] = tolower( *ptr );
1132
if (tmp[0]=='m'&&tmp[1]=='e'&&tmp[2]=='t'&&tmp[3]=='a')
1134
// found a meta tag...
1135
const char* end = ptr;
// NOTE(review): same ordering issue as above -- *end is read before end<pEnd is tested.
1136
while(*end != '>' && *end != '\0' && end<pEnd)
1138
//if ( *end == '\0' ) break;
1139
QCString str( ptr, (end-ptr)+1);
1142
//if( (pos = str.find("http-equiv", pos)) == -1) break;
1143
//if( (pos = str.find("content-type", pos)) == -1) break;
1144
if( (pos = str.find("charset")) == -1)
1148
if( (pos = str.find('=', pos)) == -1)
1151
// skip whitespace before encoding itself
1152
while (pos < (int)str.length() && str[pos] <= ' ')
1154
if ( pos == (int)str.length())
1158
// The charset value ends at the first space, quote, ';' or '>'.
while( endpos < str.length() &&
1159
(str[endpos] != ' ' && str[endpos] != '"' && str[endpos] != '\''
1160
&& str[endpos] != ';' && str[endpos] != '>') )
1163
kDebug( 6005 ) << "EncodingDetector: found charset in <meta>: " << str.mid(pos,endpos-pos).data();
1165
if (setEncoding(str.mid(pos,endpos-pos), EncodingFromMetaTag))
1168
else if (tmp[0]=='b'&&tmp[1]=='o'&&tmp[2]=='d'&&tmp[3]=='y')
1176
if (d->m_source==EncodingFromHTTPHeader)
1179
//if (len<20) //make a guess even if the file is short -- ahartmetz
1182
setEncoding("",DefaultEncoding);
1186
// NOTE(review): strlen(data) assumes a NUL-terminated buffer, but this function
// receives an explicit length and may be handed UTF-16 data containing NUL bytes;
// logging len instead would be safer -- confirm.
kDebug( 6005 ) << "EncodingDetector: using heuristics (" << strlen(data) << ")";
1189
// Dispatch to the script-specific heuristic; its result is handed to
// setEncoding() as an auto-detected encoding.
switch ( d->m_autoDetectLanguage )
1191
case EncodingDetector::Arabic:
1192
return setEncoding(automaticDetectionForArabic( (const unsigned char*) data, len ), AutoDetectedEncoding);
1194
case EncodingDetector::Baltic:
1195
return setEncoding(automaticDetectionForBaltic( (const unsigned char*) data, len ), AutoDetectedEncoding);
1197
case EncodingDetector::CentralEuropean:
1198
return setEncoding(automaticDetectionForCentralEuropean( (const unsigned char*) data, len ), AutoDetectedEncoding);
1200
case EncodingDetector::Cyrillic:
1201
return setEncoding(automaticDetectionForCyrillic( (const unsigned char*) data, len), AutoDetectedEncoding);
1203
case EncodingDetector::Greek:
1204
return setEncoding(automaticDetectionForGreek( (const unsigned char*) data, len ), AutoDetectedEncoding);
1206
case EncodingDetector::Hebrew:
1207
return setEncoding(automaticDetectionForHebrew( (const unsigned char*) data, len ), AutoDetectedEncoding);
1209
case EncodingDetector::Japanese:
1210
return setEncoding(automaticDetectionForJapanese( (const unsigned char*) data, len ), AutoDetectedEncoding);
1212
case EncodingDetector::Turkish:
1213
return setEncoding(automaticDetectionForTurkish( (const unsigned char*) data, len ), AutoDetectedEncoding);
1215
// Western European: if the heuristic result is not accepted, fall back to
// iso-8859-15 when the default codec is Latin-1, otherwise to the default codec.
case EncodingDetector::WesternEuropean:
1216
if (setEncoding(automaticDetectionForWesternEuropean( (const unsigned char*) data, len ), AutoDetectedEncoding))
1218
else if (d->m_defaultCodec->mibEnum()==MibLatin1) //detection for khtml
1220
return setEncoding("iso-8859-15",AutoDetectedEncoding);
1222
else //use default provided by eg katepart
1224
return setEncoding("",DefaultEncoding);
1227
// Scripts without a heuristic in this switch share the handling below.
case EncodingDetector::SemiautomaticDetection:
1228
case EncodingDetector::ChineseSimplified:
1229
case EncodingDetector::ChineseTraditional:
1230
case EncodingDetector::Korean:
1231
case EncodingDetector::Thai:
1232
case EncodingDetector::Unicode:
1233
case EncodingDetector::NorthernSaami:
1234
case EncodingDetector::SouthEasternEurope:
1235
case EncodingDetector::None:
1236
// huh. something's broken in this code ### FIXME
1237
//enc = 0; //Reset invalid codec we tried, so we get back to latin1 fallback.
1241
setEncoding("",DefaultEncoding);
1246
EncodingDetector::AutoDetectScript EncodingDetector::scriptForName(const QString& lang)
1249
return EncodingDetector::None;
1250
else if (lang==i18n("@item Text character set", "Unicode"))
1251
return EncodingDetector::Unicode;
1252
else if (lang==i18n("@item Text character set", "Cyrillic"))
1253
return EncodingDetector::Cyrillic;
1254
else if (lang==i18n("@item Text character set", "Western European"))
1255
return EncodingDetector::WesternEuropean;
1256
else if (lang==i18n("@item Text character set", "Central European"))
1257
return EncodingDetector::CentralEuropean;
1258
else if (lang==i18n("@item Text character set", "Greek"))
1259
return EncodingDetector::Greek;
1260
else if (lang==i18n("@item Text character set", "Hebrew"))
1261
return EncodingDetector::Hebrew;
1262
else if (lang==i18n("@item Text character set", "Turkish"))
1263
return EncodingDetector::Turkish;
1264
else if (lang==i18n("@item Text character set", "Japanese"))
1265
return EncodingDetector::Japanese;
1266
else if (lang==i18n("@item Text character set", "Baltic"))
1267
return EncodingDetector::Baltic;
1268
else if (lang==i18n("@item Text character set", "Arabic"))
1269
return EncodingDetector::Arabic;
1271
return EncodingDetector::None;
1274
bool EncodingDetector::hasAutoDetectionForScript(EncodingDetector::AutoDetectScript script)
1278
case EncodingDetector::Arabic:
1280
case EncodingDetector::Baltic:
1282
case EncodingDetector::CentralEuropean:
1284
case EncodingDetector::Cyrillic:
1286
case EncodingDetector::Greek:
1288
case EncodingDetector::Hebrew:
1290
case EncodingDetector::Japanese:
1292
case EncodingDetector::Turkish:
1294
case EncodingDetector::WesternEuropean:
1296
case EncodingDetector::ChineseTraditional:
1298
case EncodingDetector::ChineseSimplified:
1300
case EncodingDetector::Unicode:
1308
QString EncodingDetector::nameForScript(EncodingDetector::AutoDetectScript script)
1312
case EncodingDetector::Arabic:
1313
return i18n("@item Text character set", "Arabic");
1315
case EncodingDetector::Baltic:
1316
return i18n("@item Text character set", "Baltic");
1318
case EncodingDetector::CentralEuropean:
1319
return i18n("@item Text character set", "Central European");
1321
case EncodingDetector::Cyrillic:
1322
return i18n("@item Text character set", "Cyrillic");
1324
case EncodingDetector::Greek:
1325
return i18n("@item Text character set", "Greek");
1327
case EncodingDetector::Hebrew:
1328
return i18n("@item Text character set", "Hebrew");
1330
case EncodingDetector::Japanese:
1331
return i18n("@item Text character set", "Japanese");
1333
case EncodingDetector::Turkish:
1334
return i18n("@item Text character set", "Turkish");
1336
case EncodingDetector::WesternEuropean:
1337
return i18n("@item Text character set", "Western European");
1339
case EncodingDetector::ChineseTraditional:
1340
return i18n("@item Text character set", "Chinese Traditional");
1342
case EncodingDetector::ChineseSimplified:
1343
return i18n("@item Text character set", "Chinese Simplified");
1345
case EncodingDetector::Korean:
1346
return i18n("@item Text character set", "Korean");
1348
case EncodingDetector::Thai:
1349
return i18n("@item Text character set", "Thai");
1351
case EncodingDetector::Unicode:
1352
return i18n("@item Text character set", "Unicode");
1354
//case EncodingDetector::SemiautomaticDetection:
1361
// Maps a language/locale code (e.g. "ru", "zh-tw") to the script used for
// encoding auto-detection, by prefix-matching lc against the entries of
// pango_script_for_lang in table order. The first matching entry wins, which
// is why longer codes are listed before their prefixes in the table.
EncodingDetector::AutoDetectScript EncodingDetector::scriptForLanguageCode(const QString &lc)
1363
// It might make sense to do something special if the locale ends with
1364
// ".UTF-8" or "@utf8"
1365
const char *langStr = pango_script_for_lang[0].lang;
1366
// There is obvious optimization potential...
// NOTE(review): the table's end mark is an empty string ("\x00"), not a null
// pointer, so the `langStr` condition presumably never becomes false by itself.
// Termination then relies on the end-mark entry matching through startsWith()
// (see below), which returns that entry's script, EncodingDetector::None.
1367
for ( int i = 0; langStr; i++ ) {
1368
langStr = pango_script_for_lang[i].lang;
1369
// startsWith() works for empty strings: every string "starts with" an empty string.
1370
if ( lc.startsWith( QString::fromAscii( langStr ) ) )
1371
return pango_script_for_lang[i].scripts[0];