~ubuntu-branches/ubuntu/karmic/calibre/karmic

« back to all changes in this revision

Viewing changes to src/calibre/ebooks/epub/iterator.py

  • Committer: Bazaar Package Importer
  • Author(s): Martin Pitt
  • Date: 2009-07-30 12:49:41 UTC
  • mfrom: (1.3.2 upstream)
  • Revision ID: james.westby@ubuntu.com-20090730124941-qjdsmri25zt8zocn
Tags: 0.6.3+dfsg-0ubuntu1
* New upstream release. Please see http://calibre.kovidgoyal.net/new_in_6/
  for the list of new features and changes.
* remove_postinstall.patch: Update for new version.
* build_debug.patch: Does not apply any more, disable for now. Might not be
  necessary any more.
* debian/copyright: Fix reference to versionless GPL.
* debian/rules: Drop obsolete dh_desktop call.
* debian/rules: Add workaround for weird Python 2.6 setuptools behaviour of
  putting compiled .so files into src/calibre/plugins/calibre/plugins
  instead of src/calibre/plugins.
* debian/rules: Drop hal fdi moving, new upstream version does not use hal
  any more. Drop hal dependency, too.
* debian/rules: Install udev rules into /lib/udev/rules.d.
* Add debian/calibre.preinst: Remove unmodified
  /etc/udev/rules.d/95-calibre.rules on upgrade.
* debian/control: Bump Python dependencies to 2.6, since upstream needs
  it now.

Show diffs side-by-side

added added

removed removed

Lines of Context:
1
 
__license__   = 'GPL v3'
2
 
__copyright__ = '2008 Kovid Goyal <kovid at kovidgoyal.net>'
3
 
 
4
 
'''
5
 
Iterate over the HTML files in an ebook. Useful for writing viewers.
6
 
'''
7
 
 
8
 
import re, os, math, copy
9
 
from cStringIO import StringIO
10
 
 
11
 
from PyQt4.Qt import QFontDatabase
12
 
 
13
 
from calibre.ebooks.epub.from_any import MAP
14
 
from calibre.ebooks.epub.from_html import TITLEPAGE
15
 
from calibre.ebooks.epub import config
16
 
from calibre.ebooks.metadata.opf2 import OPF
17
 
from calibre.ptempfile import TemporaryDirectory
18
 
from calibre.ebooks.chardet import xml_to_unicode
19
 
from calibre.ebooks.html import create_dir
20
 
from calibre.utils.zipfile import safe_replace, ZipFile
21
 
from calibre.utils.config import DynamicConfig
22
 
 
23
 
def character_count(html):
    '''
    Return the number of "significant" text characters in a HTML string.

    Only text found between a ``>`` and the next ``<`` is counted, and every
    run of whitespace inside such a piece of text counts as one character.
    '''
    whitespace_run = re.compile(r'\s+')
    text_nodes = re.finditer(r'>[^<]+<', html)
    # Each match includes the enclosing '>' and '<', hence the -2.
    return sum(len(whitespace_run.sub(' ', node.group())) - 2
               for node in text_nodes)
32
 
 
33
 
class UnsupportedFormatError(Exception):
    '''
    Error carrying a translated "<FMT> format books are not supported"
    message.
    '''

    def __init__(self, fmt):
        '''
        :param fmt: Name of the format; it is upper-cased before being
                    inserted into the message.
        '''
        message = _('%s format books are not supported') % fmt.upper()
        super(UnsupportedFormatError, self).__init__(message)
37
 
 
38
 
class SpineItem(unicode):
    '''
    A unicode path to a file in the spine, annotated with the encoding
    reported by xml_to_unicode, the file's character count and page
    bookkeeping attributes.
    '''

    def __new__(cls, *args):
        '''
        Build the item from a path and read the file once to fill in
        ``encoding`` and ``character_count``.

        :param args: Arguments for ``unicode``; ``args[0]`` is the path.
                     Anything from the first ``#`` onwards is dropped.
        '''
        args = list(args)
        # Keep only the part before '#': that is the path that gets opened.
        args[0] = args[0].partition('#')[0]
        obj = super(SpineItem, cls).__new__(cls, *args)
        # Close the handle deterministically instead of leaving it to the
        # garbage collector.
        with open(args[0], 'rb') as stream:
            raw = stream.read()
        raw, obj.encoding = xml_to_unicode(raw)
        obj.character_count = character_count(raw)
        # Placeholder values; nothing in this constructor computes pages.
        obj.start_page = -1
        obj.pages      = -1
        obj.max_page   = -1
        return obj
52
 
 
53
 
def html2opf(path, tdir, opts):
    '''
    Run create_dir on `path` with a copy of `opts` whose ``output`` is set
    to `tdir`, and return the path ``<tdir>/metadata.opf``.

    The caller's `opts` object is not modified (only a shallow copy is).
    '''
    local_opts = copy.copy(opts)
    local_opts.output = tdir
    # presumably create_dir writes metadata.opf into tdir -- confirm
    create_dir(path, local_opts)
    opf_path = os.path.join(tdir, 'metadata.opf')
    return opf_path
58
 
 
59
 
def opf2opf(path, tdir, opts):
    '''
    Return `path` unchanged.

    `tdir` and `opts` are accepted but not used, so that this function has
    the same signature as html2opf.
    '''
    opf_path = path
    return opf_path
61
 
 
62
 
def is_supported(path):
    '''
    Return True if the file extension of `path` (lower-cased, dots removed)
    is a key of MAP or is one of 'html' and 'opf'.
    '''
    suffix = os.path.splitext(path)[1]
    ext = suffix.replace('.', '').lower()
    # Rewrite htm, html, xhtm and xhtml to the single spelling 'html'.
    ext = re.sub(r'(x{0,1})htm(l{0,1})', 'html', ext)
    known = list(MAP.keys())
    known.extend(['html', 'opf'])
    return ext in known
66
 
 
67
 
class EbookIterator(object):
68
 
 
69
 
    CHARACTERS_PER_PAGE = 1000
70
 
 
71
 
    def __init__(self, pathtoebook):
72
 
        pathtoebook = pathtoebook.strip()
73
 
        self.pathtoebook = os.path.abspath(pathtoebook)
74
 
        self.config = DynamicConfig(name='iterator')
75
 
        ext = os.path.splitext(pathtoebook)[1].replace('.', '').lower()
76
 
        ext = re.sub(r'(x{0,1})htm(l{0,1})', 'html', ext)
77
 
        map = dict(MAP)
78
 
        map['html'] = html2opf
79
 
        map['opf']  = opf2opf
80
 
        if ext not in map.keys():
81
 
            raise UnsupportedFormatError(ext)
82
 
        self.to_opf = map[ext]
83
 
 
84
 
    def search(self, text, index):
85
 
        text = text.lower()
86
 
        for i, path in enumerate(self.spine):
87
 
            if i > index:
88
 
                if text in open(path, 'rb').read().decode(path.encoding).lower():
89
 
                    return i
90
 
 
91
 
    def find_embedded_fonts(self):
92
 
        '''
93
 
        This will become unnecessary once Qt WebKit supports the @font-face rule.
94
 
        '''
95
 
        for item in self.opf.manifest:
96
 
            if item.mime_type and 'css' in item.mime_type.lower():
97
 
                css = open(item.path, 'rb').read().decode('utf-8', 'replace')
98
 
                for match in re.compile(r'@font-face\s*{([^}]+)}').finditer(css):
99
 
                    block  = match.group(1)
100
 
                    family = re.compile(r'font-family\s*:\s*([^;]+)').search(block)
101
 
                    url    = re.compile(r'url\s*\([\'"]*(.+?)[\'"]*\)', re.DOTALL).search(block)
102
 
                    if url:
103
 
                        path = url.group(1).split('/')
104
 
                        path = os.path.join(os.path.dirname(item.path), *path)
105
 
                        id = QFontDatabase.addApplicationFont(path)
106
 
                        if id != -1:
107
 
                            families = [unicode(f) for f in QFontDatabase.applicationFontFamilies(id)]
108
 
                            if family:
109
 
                                family = family.group(1).strip().replace('"', '')
110
 
                                if family not in families:
111
 
                                    print 'WARNING: Family aliasing not supported:', block
112
 
                                else:
113
 
                                    print 'Loaded embedded font:', repr(family)
114
 
 
115
 
    def __enter__(self):
116
 
        self._tdir = TemporaryDirectory('_ebook_iter')
117
 
        self.base  = self._tdir.__enter__()
118
 
        opts = config('').parse()
119
 
        self.pathtoopf = self.to_opf(self.pathtoebook, self.base, opts)
120
 
        self.opf = OPF(self.pathtoopf, os.path.dirname(self.pathtoopf))
121
 
        self.spine = [SpineItem(i.path) for i in self.opf.spine]
122
 
 
123
 
        cover = self.opf.cover
124
 
        if os.path.splitext(self.pathtoebook)[1].lower() in \
125
 
                                    ('.lit', '.mobi', '.prc') and cover:
126
 
            cfile = os.path.join(os.path.dirname(self.spine[0]), 'calibre_ei_cover.html')
127
 
            open(cfile, 'wb').write(TITLEPAGE%cover)
128
 
            self.spine[0:0] = [SpineItem(cfile)]
129
 
 
130
 
        if self.opf.path_to_html_toc is not None and \
131
 
           self.opf.path_to_html_toc not in self.spine:
132
 
            self.spine.append(SpineItem(self.opf.path_to_html_toc))
133
 
 
134
 
 
135
 
        sizes = [i.character_count for i in self.spine]
136
 
        self.pages = [math.ceil(i/float(self.CHARACTERS_PER_PAGE)) for i in sizes]
137
 
        for p, s in zip(self.pages, self.spine):
138
 
            s.pages = p
139
 
        start = 1
140
 
 
141
 
 
142
 
        for s in self.spine:
143
 
            s.start_page = start
144
 
            start += s.pages
145
 
            s.max_page = s.start_page + s.pages - 1
146
 
        self.toc = self.opf.toc
147
 
 
148
 
        self.find_embedded_fonts()
149
 
        self.read_bookmarks()
150
 
 
151
 
        return self
152
 
 
153
 
    def parse_bookmarks(self, raw):
154
 
        for line in raw.splitlines():
155
 
            if line.count('^') > 0:
156
 
                tokens = line.rpartition('^')
157
 
                title, ref = tokens[0], tokens[2]
158
 
                self.bookmarks.append((title, ref))
159
 
 
160
 
    def serialize_bookmarks(self, bookmarks):
161
 
        dat = []
162
 
        for title, bm in bookmarks:
163
 
            dat.append(u'%s^%s'%(title, bm))
164
 
        return (u'\n'.join(dat) +'\n').encode('utf-8')
165
 
 
166
 
    def read_bookmarks(self):
167
 
        self.bookmarks = []
168
 
        bmfile = os.path.join(self.base, 'META-INF', 'calibre_bookmarks.txt')
169
 
        raw = ''
170
 
        if os.path.exists(bmfile):
171
 
            raw = open(bmfile, 'rb').read().decode('utf-8')
172
 
        else:
173
 
            saved = self.config['bookmarks_'+self.pathtoebook]
174
 
            if saved:
175
 
                raw = saved
176
 
        self.parse_bookmarks(raw)
177
 
 
178
 
    def save_bookmarks(self, bookmarks=None):
179
 
        if bookmarks is None:
180
 
            bookmarks = self.bookmarks
181
 
        dat = self.serialize_bookmarks(bookmarks)
182
 
        if os.path.splitext(self.pathtoebook)[1].lower() == '.epub' and \
183
 
            os.access(self.pathtoebook, os.R_OK):
184
 
            try:
185
 
                zf = open(self.pathtoebook, 'r+b')
186
 
            except IOError:
187
 
                return
188
 
            zipf = ZipFile(zf, mode='a')
189
 
            for name in zipf.namelist():
190
 
                if name == 'META-INF/calibre_bookmarks.txt':
191
 
                    safe_replace(zf, 'META-INF/calibre_bookmarks.txt', StringIO(dat))
192
 
                    return
193
 
            zipf.writestr('META-INF/calibre_bookmarks.txt', dat)
194
 
        else:
195
 
            self.config['bookmarks_'+self.pathtoebook] = dat
196
 
 
197
 
    def add_bookmark(self, bm):
198
 
        dups = []
199
 
        for x in self.bookmarks:
200
 
            if x[0] == bm[0]:
201
 
                dups.append(x)
202
 
        for x in dups:
203
 
            self.bookmarks.remove(x)
204
 
        self.bookmarks.append(bm)
205
 
        self.save_bookmarks()
206
 
 
207
 
    def __exit__(self, *args):
208
 
        self._tdir.__exit__(*args)