~rcj/vmbuilder/jenkins_kvm-test

« back to all changes in this revision

Viewing changes to pylib/requests/packages/chardet/sbcharsetprober.py

Committer: Ben Howard
Date: 2014-08-19 20:30:00 UTC
Revision ID: ben.howard@ubuntu.com-20140819203000-9gfgaryo1w41orxu

12.04 does not ship with a version of python3-requests, so we need
to provide it.

files added:
pylib

pylib/requests

pylib/requests/__init__.py

pylib/requests/adapters.py

pylib/requests/api.py

pylib/requests/auth.py

pylib/requests/cacert.pem

pylib/requests/certs.py

pylib/requests/compat.py

pylib/requests/cookies.py

pylib/requests/exceptions.py

pylib/requests/hooks.py

pylib/requests/models.py

pylib/requests/packages

pylib/requests/packages/__init__.py

pylib/requests/packages/chardet

pylib/requests/packages/chardet/__init__.py

pylib/requests/packages/chardet/big5freq.py

pylib/requests/packages/chardet/big5prober.py

pylib/requests/packages/chardet/chardetect.py

pylib/requests/packages/chardet/chardistribution.py

pylib/requests/packages/chardet/charsetgroupprober.py

pylib/requests/packages/chardet/charsetprober.py

pylib/requests/packages/chardet/codingstatemachine.py

pylib/requests/packages/chardet/compat.py

pylib/requests/packages/chardet/constants.py

pylib/requests/packages/chardet/cp949prober.py

pylib/requests/packages/chardet/escprober.py

pylib/requests/packages/chardet/escsm.py

pylib/requests/packages/chardet/eucjpprober.py

pylib/requests/packages/chardet/euckrfreq.py

pylib/requests/packages/chardet/euckrprober.py

pylib/requests/packages/chardet/euctwfreq.py

pylib/requests/packages/chardet/euctwprober.py

pylib/requests/packages/chardet/gb2312freq.py

pylib/requests/packages/chardet/gb2312prober.py

pylib/requests/packages/chardet/hebrewprober.py

pylib/requests/packages/chardet/jisfreq.py

pylib/requests/packages/chardet/jpcntx.py

pylib/requests/packages/chardet/langbulgarianmodel.py

pylib/requests/packages/chardet/langcyrillicmodel.py

pylib/requests/packages/chardet/langgreekmodel.py

pylib/requests/packages/chardet/langhebrewmodel.py

pylib/requests/packages/chardet/langhungarianmodel.py

pylib/requests/packages/chardet/langthaimodel.py

pylib/requests/packages/chardet/latin1prober.py

pylib/requests/packages/chardet/mbcharsetprober.py

pylib/requests/packages/chardet/mbcsgroupprober.py

pylib/requests/packages/chardet/mbcssm.py

pylib/requests/packages/chardet/sbcharsetprober.py

pylib/requests/packages/chardet/sbcsgroupprober.py

pylib/requests/packages/chardet/sjisprober.py

pylib/requests/packages/chardet/universaldetector.py

pylib/requests/packages/chardet/utf8prober.py

pylib/requests/packages/urllib3

pylib/requests/packages/urllib3/__init__.py

pylib/requests/packages/urllib3/_collections.py

pylib/requests/packages/urllib3/connection.py

pylib/requests/packages/urllib3/connectionpool.py

pylib/requests/packages/urllib3/contrib

pylib/requests/packages/urllib3/contrib/__init__.py

pylib/requests/packages/urllib3/contrib/ntlmpool.py

pylib/requests/packages/urllib3/contrib/pyopenssl.py

pylib/requests/packages/urllib3/exceptions.py

pylib/requests/packages/urllib3/fields.py

pylib/requests/packages/urllib3/filepost.py

pylib/requests/packages/urllib3/packages

pylib/requests/packages/urllib3/packages/__init__.py

pylib/requests/packages/urllib3/packages/ordered_dict.py

pylib/requests/packages/urllib3/packages/six.py

pylib/requests/packages/urllib3/packages/ssl_match_hostname

pylib/requests/packages/urllib3/packages/ssl_match_hostname/__init__.py

pylib/requests/packages/urllib3/packages/ssl_match_hostname/_implementation.py

pylib/requests/packages/urllib3/poolmanager.py

pylib/requests/packages/urllib3/request.py

pylib/requests/packages/urllib3/response.py

pylib/requests/packages/urllib3/util

pylib/requests/packages/urllib3/util/__init__.py

pylib/requests/packages/urllib3/util/connection.py

pylib/requests/packages/urllib3/util/request.py

pylib/requests/packages/urllib3/util/response.py

pylib/requests/packages/urllib3/util/ssl_.py

pylib/requests/packages/urllib3/util/timeout.py

pylib/requests/packages/urllib3/util/url.py

pylib/requests/sessions.py

pylib/requests/status_codes.py

pylib/requests/structures.py

pylib/requests/utils.py

Show diffs side-by-side

added added

removed removed

pylib/requests/packages/chardet/sbcharsetprober.py

######################## BEGIN LICENSE BLOCK ########################

# The Original Code is Mozilla Universal charset detector code.

# The Initial Developer of the Original Code is

# Netscape Communications Corporation.

# Contributor(s):

# Mark Pilgrim - port to Python

# Shy Shalom - original C code

# This library is free software; you can redistribute it and/or

# modify it under the terms of the GNU Lesser General Public

# License as published by the Free Software Foundation; either

# version 2.1 of the License, or (at your option) any later version.

# This library is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

# Lesser General Public License for more details.

# You should have received a copy of the GNU Lesser General Public

# License along with this library; if not, write to the Free Software

# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA

# 02110-1301 USA

######################### END LICENSE BLOCK #########################

import sys

from . import constants

from .charsetprober import CharSetProber

from .compat import wrap_ord

# Tuning constants for SingleByteCharSetProber.

# Character orders below this value are counted as "frequent" characters;
# it is also the row width used to index the model's 'precedenceMatrix'.
SAMPLE_SIZE = 64

# feed() only consults the confidence shortcuts once more than this many
# character pairs (sequences) have been counted.
SB_ENOUGH_REL_THRESHOLD = 1024

# Confidence above this value makes feed() set the state to eFoundIt.
POSITIVE_SHORTCUT_THRESHOLD = 0.95

# Confidence below this value makes feed() set the state to eNotMe.
NEGATIVE_SHORTCUT_THRESHOLD = 0.05

# Character orders below this value are counted in the total character
# tally; orders at or above it are ignored by that tally.
SYMBOL_CAT_ORDER = 250

# Number of sequence categories, i.e. the length of the per-category
# sequence counter list created in reset().
NUMBER_OF_SEQ_CAT = 4

# Index of the counter that get_confidence() reads (the last category).
POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1

# Kept from the original source for reference; not used in this module.
#NEGATIVE_CAT = 0

class SingleByteCharSetProber(CharSetProber):
    """Prober that scores a byte stream against one single-byte model.

    The model is a mapping read through the keys 'charsetName',
    'keepEnglishLetter', 'charToOrderMap', 'precedenceMatrix' and
    'mTypicalPositiveRatio'.  Each fed byte is translated to an "order"
    via 'charToOrderMap'; consecutive pairs of frequent orders are
    classified through 'precedenceMatrix' and tallied per category.
    """

    def __init__(self, model, reversed=False, nameProber=None):
        """Store the model and options, then reset all counters.

        model      -- language model mapping (see class docstring).
        reversed   -- when true, swap the two orders of every pair before
                      the 'precedenceMatrix' lookup.  (The name shadows
                      the builtin, but it is part of the public
                      signature and therefore kept.)
        nameProber -- optional prober whose get_charset_name() result is
                      reported instead of the model's own charset name.
        """
        CharSetProber.__init__(self)
        self._mModel = model
        # TRUE if we need to reverse every pair in the model lookup
        self._mReversed = reversed
        # Optional auxiliary prober for name decision
        self._mNameProber = nameProber
        self.reset()

    def reset(self):
        """Reset the base prober state and clear every counter."""
        CharSetProber.reset(self)
        # Order of the last character seen.  255 is >= SAMPLE_SIZE, so the
        # first character fed after a reset never completes a sequence.
        self._mLastOrder = 255
        # One counter per sequence category.
        self._mSeqCounters = [0] * NUMBER_OF_SEQ_CAT
        # Number of character pairs classified so far.
        self._mTotalSeqs = 0
        # Number of characters whose order is below SYMBOL_CAT_ORDER.
        self._mTotalChar = 0
        # characters that fall in our sampling range
        self._mFreqChar = 0

    def get_charset_name(self):
        """Return the charset name, preferring the auxiliary name prober."""
        if self._mNameProber:
            return self._mNameProber.get_charset_name()
        return self._mModel['charsetName']

    def feed(self, aBuf):
        """Update the statistics with aBuf and return the prober state.

        Unless the model sets 'keepEnglishLetter', the buffer is first
        passed through self.filter_without_english_letters().  An empty
        buffer leaves all counters untouched.  Once more than
        SB_ENOUGH_REL_THRESHOLD sequences have been counted while the
        prober is still in the eDetecting state, the confidence is
        compared against the shortcut thresholds and the state may be
        switched to eFoundIt or eNotMe.
        """
        if not self._mModel['keepEnglishLetter']:
            aBuf = self.filter_without_english_letters(aBuf)
        if not len(aBuf):
            return self.get_state()

        # The buffer is non-empty here, so this lookup would happen on the
        # first iteration anyway; doing it once avoids repeating it.
        char_to_order = self._mModel['charToOrderMap']
        for c in aBuf:
            order = char_to_order[wrap_ord(c)]
            if order < SYMBOL_CAT_ORDER:
                self._mTotalChar += 1
            if order < SAMPLE_SIZE:
                self._mFreqChar += 1
                # A sequence needs both the previous and the current
                # character to be inside the sampling range.
                if self._mLastOrder < SAMPLE_SIZE:
                    self._mTotalSeqs += 1
                    if not self._mReversed:
                        i = (self._mLastOrder * SAMPLE_SIZE) + order
                    else:  # reverse the order of the letters in the lookup
                        i = (order * SAMPLE_SIZE) + self._mLastOrder
                    category = self._mModel['precedenceMatrix'][i]
                    self._mSeqCounters[category] += 1
            self._mLastOrder = order

        if self.get_state() == constants.eDetecting:
            if self._mTotalSeqs > SB_ENOUGH_REL_THRESHOLD:
                cf = self.get_confidence()
                if cf > POSITIVE_SHORTCUT_THRESHOLD:
                    if constants._debug:
                        # The adjacent literals are concatenated verbatim,
                        # so the separating space must be explicit.
                        sys.stderr.write('%s confidence = %s, we have a '
                                         'winner\n' %
                                         (self._mModel['charsetName'], cf))
                    self._mState = constants.eFoundIt
                elif cf < NEGATIVE_SHORTCUT_THRESHOLD:
                    if constants._debug:
                        sys.stderr.write('%s confidence = %s, below negative '
                                         'shortcut threshhold %s\n' %
                                         (self._mModel['charsetName'], cf,
                                          NEGATIVE_SHORTCUT_THRESHOLD))
                    self._mState = constants.eNotMe

        return self.get_state()

    def get_confidence(self):
        """Return the current confidence as a float.

        With no sequences counted the result is 0.01.  Otherwise it is the
        share of sequences in POSITIVE_CAT, divided by the model's
        'mTypicalPositiveRatio' and scaled by the ratio of frequent
        characters to total characters; values of 1.0 or more are
        reported as 0.99.
        """
        r = 0.01
        if self._mTotalSeqs > 0:
            r = ((1.0 * self._mSeqCounters[POSITIVE_CAT]) / self._mTotalSeqs
                 / self._mModel['mTypicalPositiveRatio'])
            # feed() only counts a sequence for a character that was also
            # counted in _mTotalChar, so the divisor is non-zero here.
            r = r * self._mFreqChar / self._mTotalChar
            if r >= 1.0:
                r = 0.99
        return r

Older »