######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Mark Pilgrim - port to Python
# Shy Shalom - original C code
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from charsetprober import CharSetProber
37
# Number of frequency categories a class transition can fall into.  The
# model below only contains the values 0..3, and the prober indexes its
# counter list with them.
FREQ_CAT_NUM = 4

# Character classes.  Their numeric values are the row/column indices of
# Latin1ClassModel, in the order given by that table's header comment.
UDF = 0  # undefined
OTH = 1  # other
ASC = 2  # ascii capital letter
ASS = 3  # ascii small letter
ACV = 4  # accent capital vowel
ACO = 5  # accent capital other
ASV = 6  # accent small vowel
ASO = 7  # accent small other
CLASS_NUM = 8  # total classes

# Character class of every byte value, indexed by the byte (0x00 - 0xFF).
Latin1_CharToClass = (
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 00 - 07
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 08 - 0F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 10 - 17
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 18 - 1F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 20 - 27
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 28 - 2F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 30 - 37
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 38 - 3F
    OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 40 - 47
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 48 - 4F
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 50 - 57
    ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH,  # 58 - 5F
    OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 60 - 67
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 68 - 6F
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 70 - 77
    ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH,  # 78 - 7F
    OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH,  # 80 - 87
    OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF,  # 88 - 8F
    UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 90 - 97
    OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO,  # 98 - 9F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # A0 - A7
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # A8 - AF
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # B0 - B7
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # B8 - BF
    ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO,  # C0 - C7
    ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV,  # C8 - CF
    ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH,  # D0 - D7
    ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO,  # D8 - DF
    ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO,  # E0 - E7
    ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV,  # E8 - EF
    ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH,  # F0 - F7
    ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO,  # F8 - FF
)

# Frequency category (0..3) of each (previous class, current class) pair,
# stored row-major: index = previous_class * CLASS_NUM + current_class.
# Category 0 appears only where one of the two classes is UDF.
Latin1ClassModel = (
    # UDF OTH ASC ASS ACV ACO ASV ASO
    0, 0, 0, 0, 0, 0, 0, 0,  # UDF
    0, 3, 3, 3, 3, 3, 3, 3,  # OTH
    0, 3, 3, 3, 3, 3, 3, 3,  # ASC
    0, 3, 3, 3, 1, 1, 3, 3,  # ASS
    0, 3, 3, 3, 1, 2, 1, 2,  # ACV
    0, 3, 3, 3, 3, 3, 3, 3,  # ACO
    0, 3, 1, 3, 1, 1, 1, 3,  # ASV
    0, 3, 1, 3, 1, 1, 3, 3,  # ASO
)
class Latin1Prober(CharSetProber):
    """Prober that scores input as windows-1252 from character-class pairs.

    Each byte is mapped to a character class through ``Latin1_CharToClass``.
    Every (previous class, current class) pair is looked up in
    ``Latin1ClassModel`` and tallied per frequency category in
    ``self._mFreqCounter``; ``get_confidence`` turns those tallies into a
    score.
    """

    def __init__(self):
        CharSetProber.__init__(self)
        # Establish _mLastCharClass / _mFreqCounter before the first feed().
        self.reset()

    def reset(self):
        """Clear the collected statistics and reset the base prober."""
        self._mLastCharClass = OTH
        self._mFreqCounter = [0] * FREQ_CAT_NUM
        CharSetProber.reset(self)

    def get_charset_name(self):
        """Return the name of the charset this prober reports."""
        return "windows-1252"

    def feed(self, aBuf):
        """Tally the class transitions in ``aBuf`` and return the state.

        The buffer is first passed through the inherited
        ``filter_with_english_letters`` helper.  Scanning stops and the
        prober is marked ``constants.eNotMe`` at the first transition whose
        model category is 0.
        """
        aBuf = self.filter_with_english_letters(aBuf)
        for c in aBuf:
            charClass = Latin1_CharToClass[ord(c)]
            freq = Latin1ClassModel[(self._mLastCharClass * CLASS_NUM) + charClass]
            if freq == 0:
                # Category 0 only occurs for pairs involving an undefined
                # byte, so the input cannot be this charset.
                self._mState = constants.eNotMe
                break
            self._mFreqCounter[freq] += 1
            self._mLastCharClass = charClass

        return self.get_state()

    def get_confidence(self):
        """Return a confidence score for the data fed so far.

        The score is the share of category-3 transitions minus twenty times
        the share of category-1 transitions, clamped at 0.0 and then halved.
        """
        if self.get_state() == constants.eNotMe:
            # NOTE(review): near-zero score for a ruled-out prober; confirm
            # the exact value against the other probers in the project.
            return 0.01

        total = sum(self._mFreqCounter)
        if not total:
            # Nothing tallied yet: avoid dividing by zero.
            confidence = 0.0
        else:
            # The 20.0 factor forces float arithmetic, so the quotient is not
            # truncated by integer division.
            confidence = (self._mFreqCounter[3]
                          - self._mFreqCounter[1] * 20.0) / total
        if confidence < 0.0:
            confidence = 0.0

        # Lower the confidence of latin1 so that other, more accurate
        # detectors can take priority.
        confidence = confidence * 0.5
        return confidence