~ubuntu-branches/debian/sid/calibre/sid

« back to all changes in this revision

Viewing changes to src/calibre/ebooks/chardet.py

  • Committer: Package Import Robot
  • Author(s): Martin Pitt
  • Date: 2014-02-27 07:48:06 UTC
  • mto: This revision was merged to the branch mainline in revision 74.
  • Revision ID: package-import@ubuntu.com-20140227074806-64wdebb3ptosxhhx
Tags: upstream-1.25.0+dfsg
Import upstream version 1.25.0+dfsg

Show diffs side-by-side

added added

removed removed

Lines of Context:
10
10
import re, codecs
11
11
 
12
12
ENCODING_PATS = [
13
 
                # XML declaration
14
 
                 re.compile(r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>',
15
 
                            re.IGNORECASE),
16
 
                 # HTML 4 Pragma directive
17
 
                 re.compile(r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>''',
18
 
                            re.IGNORECASE),
19
 
                 # HTML 5 charset
20
 
                 re.compile(r'''<meta\s+charset=['"]([-_a-z0-9]+)['"][^<>]*>''',
21
 
                     re.IGNORECASE),
22
 
                 ]
 
13
    # XML declaration
 
14
    re.compile(r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE),
 
15
    # HTML 4 Pragma directive
 
16
    re.compile(r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>(?:\s*</meta>){0,1}''', re.IGNORECASE),
 
17
    # HTML 5 charset
 
18
    re.compile(r'''<meta\s+charset=['"]([-_a-z0-9]+)['"][^<>]*>(?:\s*</meta>){0,1}''', re.IGNORECASE),
 
19
]
23
20
ENTITY_PATTERN = re.compile(r'&(\S+?);')
24
21
 
25
22
def strip_encoding_declarations(raw):
35
32
    from calibre import xml_entity_to_unicode
36
33
    return ENTITY_PATTERN.sub(xml_entity_to_unicode, raw)
37
34
 
38
 
_CHARSET_ALIASES = { "macintosh" : "mac-roman",
39
 
                        "x-sjis" : "shift-jis" }
 
35
_CHARSET_ALIASES = {"macintosh" : "mac-roman",
 
36
                        "x-sjis" : "shift-jis"}
40
37
 
41
38
def detect(*args, **kwargs):
42
39
    from chardet import detect
58
55
    if not encoding:
59
56
        encoding = preferred_encoding
60
57
    encoding = encoding.lower()
61
 
    if _CHARSET_ALIASES.has_key(encoding):
62
 
        encoding = _CHARSET_ALIASES[encoding]
 
58
    encoding = _CHARSET_ALIASES.get(encoding, encoding)
63
59
    if encoding == 'ascii':
64
60
        encoding = 'utf-8'
65
61
    return encoding