~rcj/vmbuilder/jenkins_kvm-test

« back to all changes in this revision

Viewing changes to pylib/requests/packages/chardet/sbcharsetprober.py

  • Committer: Ben Howard
  • Date: 2014-08-19 20:30:00 UTC
  • Revision ID: ben.howard@ubuntu.com-20140819203000-9gfgaryo1w41orxu
12.04 does not ship with a version of python3-requests, so we need
to provide it.

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
######################## BEGIN LICENSE BLOCK ########################
 
2
# The Original Code is Mozilla Universal charset detector code.
 
3
#
 
4
# The Initial Developer of the Original Code is
 
5
# Netscape Communications Corporation.
 
6
# Portions created by the Initial Developer are Copyright (C) 2001
 
7
# the Initial Developer. All Rights Reserved.
 
8
#
 
9
# Contributor(s):
 
10
#   Mark Pilgrim - port to Python
 
11
#   Shy Shalom - original C code
 
12
#
 
13
# This library is free software; you can redistribute it and/or
 
14
# modify it under the terms of the GNU Lesser General Public
 
15
# License as published by the Free Software Foundation; either
 
16
# version 2.1 of the License, or (at your option) any later version.
 
17
#
 
18
# This library is distributed in the hope that it will be useful,
 
19
# but WITHOUT ANY WARRANTY; without even the implied warranty of
 
20
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 
21
# Lesser General Public License for more details.
 
22
#
 
23
# You should have received a copy of the GNU Lesser General Public
 
24
# License along with this library; if not, write to the Free Software
 
25
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 
26
# 02110-1301  USA
 
27
######################### END LICENSE BLOCK #########################
 
28
 
 
29
import sys
 
30
from . import constants
 
31
from .charsetprober import CharSetProber
 
32
from .compat import wrap_ord
 
33
 
 
34
# Character orders below SAMPLE_SIZE are the ones counted as "frequent" and
# used to index the model's precedence matrix (row * SAMPLE_SIZE + column).
SAMPLE_SIZE = 64

# Number of observed sequences required before feed() attempts to
# short-circuit detection based on the current confidence.
SB_ENOUGH_REL_THRESHOLD = 1024

# Confidence bounds for the short-circuit: above the positive bound the
# prober declares a match, below the negative bound it rules itself out.
POSITIVE_SHORTCUT_THRESHOLD = 0.95
NEGATIVE_SHORTCUT_THRESHOLD = 0.05

# Character orders below this value are counted towards the total number of
# characters seen.
SYMBOL_CAT_ORDER = 250

# Sequence categories are numbered 0 .. NUMBER_OF_SEQ_CAT - 1; the highest
# one is the "positive" category (the lowest, 0, is the negative one).
NUMBER_OF_SEQ_CAT = 4
POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1
 
42
 
 
43
 
 
44
class SingleByteCharSetProber(CharSetProber):
    """Charset prober driven by a single-byte language model.

    The model is a mapping from which this class reads the keys
    'charsetName', 'keepEnglishLetter', 'charToOrderMap',
    'precedenceMatrix' and 'mTypicalPositiveRatio'.
    """

    def __init__(self, model, reversed=False, nameProber=None):
        """Create a prober for ``model``.

        model      -- the language model mapping (see class docstring).
        reversed   -- when true, every character pair is looked up in the
                      precedence matrix with its two letters swapped.
        nameProber -- optional auxiliary prober; when given, the reported
                      charset name is delegated to it.
        """
        CharSetProber.__init__(self)
        self._mModel = model
        # TRUE if we need to reverse every pair in the model lookup
        self._mReversed = reversed
        # Optional auxiliary prober for name decision
        self._mNameProber = nameProber
        self.reset()

    def reset(self):
        """Reset the base prober and clear all per-stream counters."""
        CharSetProber.reset(self)
        # Order of the last character seen. 255 is not below SAMPLE_SIZE, so
        # the first character fed never completes a sequence.
        self._mLastOrder = 255
        # One counter per sequence category.
        self._mSeqCounters = [0] * NUMBER_OF_SEQ_CAT
        self._mTotalSeqs = 0
        self._mTotalChar = 0
        # characters that fall in our sampling range
        self._mFreqChar = 0

    def get_charset_name(self):
        """Return the charset name from the name prober or from the model."""
        if self._mNameProber:
            return self._mNameProber.get_charset_name()
        return self._mModel['charsetName']

    def feed(self, aBuf):
        """Update the statistics with ``aBuf`` and return the prober state.

        Unless the model sets 'keepEnglishLetter', the buffer is first passed
        through ``filter_without_english_letters``. Once more than
        SB_ENOUGH_REL_THRESHOLD sequences have been counted while still
        detecting, the state is switched to eFoundIt or eNotMe if the
        confidence crosses the positive or negative shortcut threshold.
        """
        model = self._mModel
        if not model['keepEnglishLetter']:
            aBuf = self.filter_without_english_letters(aBuf)
        if not len(aBuf):
            return self.get_state()

        # Loop-invariant lookup, hoisted out of the per-character loop.
        char_to_order = model['charToOrderMap']
        for c in aBuf:
            order = char_to_order[wrap_ord(c)]
            if order < SYMBOL_CAT_ORDER:
                self._mTotalChar += 1
            if order < SAMPLE_SIZE:
                self._mFreqChar += 1
                if self._mLastOrder < SAMPLE_SIZE:
                    self._mTotalSeqs += 1
                    if self._mReversed:
                        # reverse the order of the letters in the lookup
                        i = (order * SAMPLE_SIZE) + self._mLastOrder
                    else:
                        i = (self._mLastOrder * SAMPLE_SIZE) + order
                    category = model['precedenceMatrix'][i]
                    self._mSeqCounters[category] += 1
            self._mLastOrder = order

        if self.get_state() == constants.eDetecting:
            if self._mTotalSeqs > SB_ENOUGH_REL_THRESHOLD:
                cf = self.get_confidence()
                if cf > POSITIVE_SHORTCUT_THRESHOLD:
                    if constants._debug:
                        # The split literal needs an explicit space at the
                        # join, otherwise the words run together.
                        sys.stderr.write('%s confidence = %s, we have a '
                                         'winner\n' %
                                         (model['charsetName'], cf))
                    self._mState = constants.eFoundIt
                elif cf < NEGATIVE_SHORTCUT_THRESHOLD:
                    if constants._debug:
                        sys.stderr.write('%s confidence = %s, below negative '
                                         'shortcut threshhold %s\n' %
                                         (model['charsetName'], cf,
                                          NEGATIVE_SHORTCUT_THRESHOLD))
                    self._mState = constants.eNotMe

        return self.get_state()

    def get_confidence(self):
        """Return the current confidence for this charset.

        With no sequences counted yet the result is 0.01. Otherwise it is
        the share of sequences in the positive category, divided by the
        model's 'mTypicalPositiveRatio' and scaled by the fraction of
        counted characters that fell in the sampling range; a result of
        1.0 or more is replaced by 0.99.
        """
        r = 0.01
        if self._mTotalSeqs > 0:
            r = ((1.0 * self._mSeqCounters[POSITIVE_CAT]) / self._mTotalSeqs
                 / self._mModel['mTypicalPositiveRatio'])
            r = r * self._mFreqChar / self._mTotalChar
            if r >= 1.0:
                r = 0.99
        return r