~ubuntu-branches/ubuntu/karmic/calibre/karmic

« back to all changes in this revision

Viewing changes to src/calibre/ebooks/oeb/iterator.py

  • Committer: Bazaar Package Importer
  • Author(s): Martin Pitt
  • Date: 2009-07-30 12:49:41 UTC
  • mfrom: (1.3.2 upstream)
  • Revision ID: james.westby@ubuntu.com-20090730124941-qjdsmri25zt8zocn
Tags: 0.6.3+dfsg-0ubuntu1
* New upstream release. Please see http://calibre.kovidgoyal.net/new_in_6/
  for the list of new features and changes.
* remove_postinstall.patch: Update for new version.
* build_debug.patch: Does not apply any more, disable for now. Might not be
  necessary any more.
* debian/copyright: Fix reference to versionless GPL.
* debian/rules: Drop obsolete dh_desktop call.
* debian/rules: Add workaround for weird Python 2.6 setuptools behaviour of
  putting compiled .so files into src/calibre/plugins/calibre/plugins
  instead of src/calibre/plugins.
* debian/rules: Drop hal fdi moving, new upstream version does not use hal
  any more. Drop hal dependency, too.
* debian/rules: Install udev rules into /lib/udev/rules.d.
* Add debian/calibre.preinst: Remove unmodified
  /etc/udev/rules.d/95-calibre.rules on upgrade.
* debian/control: Bump Python dependencies to 2.6, since upstream needs
  it now.

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
from __future__ import with_statement
 
2
__license__   = 'GPL v3'
 
3
__copyright__ = '2008 Kovid Goyal <kovid at kovidgoyal.net>'
 
4
 
 
5
'''
 
6
Iterate over the HTML files in an ebook. Useful for writing viewers.
 
7
'''
 
8
 
 
9
import re, os, math
 
10
from cStringIO import StringIO
 
11
 
 
12
from PyQt4.Qt import QFontDatabase
 
13
 
 
14
from calibre.customize.ui import available_input_formats
 
15
from calibre.ebooks.metadata.opf2 import OPF
 
16
from calibre.ptempfile import TemporaryDirectory
 
17
from calibre.ebooks.chardet import xml_to_unicode
 
18
from calibre.utils.zipfile import safe_replace, ZipFile
 
19
from calibre.utils.config import DynamicConfig
 
20
from calibre.utils.logging import Log
 
21
from calibre.ebooks.epub.output import EPUBOutput
 
22
 
 
23
# Cover-page HTML template, decoded from UTF-8 bytes to unicode.
# EbookIterator.__enter__ fills it with the OPF cover via the % operator and
# writes the result out as an extra leading spine item.
TITLEPAGE = EPUBOutput.TITLEPAGE_COVER.decode('utf-8')
 
24
 
 
25
def character_count(html):
    '''
    Count the "significant" text characters in a string of HTML markup.

    Only text sitting between a ``>`` and the following ``<`` is counted, and
    every run of whitespace inside such a piece of text counts as a single
    character.
    '''
    whitespace_run = re.compile(r'\s+')
    text_runs = re.finditer(r'>[^<]+<', html)
    # Each match still carries its delimiting '>' and '<', hence the -2.
    return sum(len(whitespace_run.sub(' ', run.group())) - 2
               for run in text_runs)
 
34
 
 
35
class UnsupportedFormatError(Exception):
    '''
    Error whose message states that books of the given format are not
    supported. The format name is upper-cased in the (translated) message.
    '''

    def __init__(self, fmt):
        message = _('%s format books are not supported') % fmt.upper()
        Exception.__init__(self, message)
 
39
 
 
40
class SpineItem(unicode):
    '''
    Unicode path of a single file in the spine, annotated with data about
    the file's contents.

    Attributes set on every instance:

      * ``encoding``: the encoding reported by ``xml_to_unicode`` for the file
      * ``character_count``: result of ``character_count()`` on the decoded text
      * ``start_page``, ``pages``, ``max_page``: initialised to -1 here; they
        are filled in later by ``EbookIterator.__enter__``
    '''

    def __new__(cls, *args):
        args = list(args)
        path = args[0]
        # Fall back to the path without its '#fragment' when only that
        # variant exists on disk.
        ppath = path.partition('#')[0]
        if not os.path.exists(path) and os.path.exists(ppath):
            path = ppath
        args[0] = path
        obj = super(SpineItem, cls).__new__(cls, *args)
        # Read through a context manager so the handle is closed right away
        # instead of whenever the garbage collector gets to it.
        with open(path, 'rb') as stream:
            raw = stream.read()
        raw, obj.encoding = xml_to_unicode(raw)
        obj.character_count = character_count(raw)
        obj.start_page = -1
        obj.pages      = -1
        obj.max_page   = -1
        return obj
 
57
 
 
58
class FakeOpts(object):
    '''
    Plain holder of fixed option values.

    NOTE(review): presumably a stand-in for a conversion options object;
    nothing in the visible part of this file uses it, so confirm against
    callers.
    '''

    input_encoding = None
    max_levels = 5
    breadth_first = False
    verbose = 0
 
63
 
 
64
def is_supported(path):
    '''
    Return True if the file extension of `path` is one of the available
    input formats.

    The extension is lower-cased and stripped of dots, and any
    htm/html/xhtm/xhtml spelling inside it is rewritten to ``html`` before
    the lookup.
    '''
    extension = os.path.splitext(path)[1]
    extension = extension.replace('.', '').lower()
    normalized = re.sub(r'(x{0,1})htm(l{0,1})', 'html', extension)
    return normalized in available_input_formats()
 
68
 
 
69
 
 
70
def write_oebbook(oeb, path):
    '''
    Write `oeb` into the directory `path` with OEBWriter and return the first
    file found under `path` whose name ends in ``.opf`` (None if there is no
    such file).
    '''
    from calibre import walk
    from calibre.ebooks.oeb.writer import OEBWriter
    writer = OEBWriter()
    writer(oeb, path)
    opf_files = (name for name in walk(path) if name.endswith('.opf'))
    return next(opf_files, None)
 
78
 
 
79
class EbookIterator(object):
    '''
    Context manager giving access to the HTML files of an ebook.

    Entering the context runs the book through the input plugin chosen by
    the conversion Plumber, using a TemporaryDirectory as the working
    directory, and populates ``opf``, ``language``, ``spine``, ``pages``,
    ``toc`` and ``bookmarks``. Leaving the context exits the
    TemporaryDirectory and deletes any extra files this class created.
    '''

    # Number of significant characters counted as one page when estimating
    # page numbers for the spine items.
    CHARACTERS_PER_PAGE = 1000

    def __init__(self, pathtoebook, log=None):
        '''
        :param pathtoebook: Path to the ebook; surrounding whitespace is
                            stripped and the absolute path is stored.
        :param log: Logger to use. A fresh ``Log()`` is created when None.
        '''
        self.log = Log() if log is None else log
        pathtoebook = pathtoebook.strip()
        self.pathtoebook = os.path.abspath(pathtoebook)
        self.config = DynamicConfig(name='iterator')
        # Normalised extension: no dots, lower case, with any
        # htm/html/xhtm/xhtml spelling rewritten to 'html'.
        ext = os.path.splitext(pathtoebook)[1].replace('.', '').lower()
        ext = re.sub(r'(x{0,1})htm(l{0,1})', 'html', ext)
        self.ebook_ext = ext

    def search(self, text, index):
        '''
        Return the index of the first spine item after position `index`
        whose contents contain `text` (case-insensitively), or None when no
        later item matches.
        '''
        text = text.lower()
        for i, path in enumerate(self.spine):
            if i <= index:
                continue
            # Close each file as soon as it has been read.
            with open(path, 'rb') as stream:
                contents = stream.read().decode(path.encoding)
            if text in contents.lower():
                return i
        return None

    def find_embedded_fonts(self):
        '''
        Register the fonts referenced by @font-face rules in the book's CSS
        files with QFontDatabase.

        This will become unnecessary once Qt WebKit supports the @font-face rule.
        '''
        # Compile once instead of once per stylesheet / per rule.
        font_face_pat = re.compile(r'@font-face\s*{([^}]+)}')
        family_pat = re.compile(r'font-family\s*:\s*([^;]+)')
        url_pat = re.compile(r'url\s*\([\'"]*(.+?)[\'"]*\)', re.DOTALL)

        for item in self.opf.manifest:
            if not (item.mime_type and 'css' in item.mime_type.lower()):
                continue
            with open(item.path, 'rb') as stream:
                css = stream.read().decode('utf-8', 'replace')
            for match in font_face_pat.finditer(css):
                block = match.group(1)
                url = url_pat.search(block)
                if not url:
                    continue
                # The URL is resolved relative to the stylesheet's directory.
                path = os.path.join(os.path.dirname(item.path),
                        *url.group(1).split('/'))
                font_id = QFontDatabase.addApplicationFont(path)
                if font_id == -1:
                    # Qt could not load the font file.
                    continue
                family = family_pat.search(block)
                if not family:
                    continue
                families = [unicode(name) for name in
                        QFontDatabase.applicationFontFamilies(font_id)]
                family = family.group(1).strip().replace('"', '')
                # Single-argument print(): same output under Python 2 and
                # valid syntax under Python 3.
                if family not in families:
                    print('WARNING: Family aliasing not supported: %s' % block)
                else:
                    print('Loaded embedded font: %s' % repr(family))

    def __enter__(self):
        '''
        Unpack the book into a temporary directory and build the spine, the
        page estimates, the table of contents and the bookmarks.

        :return: self
        '''
        self.delete_on_exit = []
        self._tdir = TemporaryDirectory('_ebook_iter')
        self.base  = self._tdir.__enter__()
        from calibre.ebooks.conversion.plumber import Plumber, create_oebbook
        plumber = Plumber(self.pathtoebook, self.base, self.log)
        plumber.setup_options()
        if self.pathtoebook.lower().endswith('.opf'):
            plumber.opts.dont_package = True
        if hasattr(plumber.opts, 'no_process'):
            plumber.opts.no_process = True

        plumber.input_plugin.for_viewer = True
        # NOTE(review): the input stream is handed to the plugin and never
        # closed explicitly here; confirm whether plugins take ownership of
        # it before adding a close.
        self.pathtoopf = plumber.input_plugin(open(plumber.input, 'rb'),
                plumber.opts, plumber.input_fmt, self.log,
                {}, self.base)

        if plumber.input_fmt.lower() in ('pdf', 'rb'):
            self.pathtoopf = create_oebbook(self.log, self.pathtoopf, plumber.opts,
                    plumber.input_plugin)
        # An object with a manifest is written out to disk so that
        # pathtoopf ends up being the path of an OPF file.
        if hasattr(self.pathtoopf, 'manifest'):
            self.pathtoopf = write_oebbook(self.pathtoopf, self.base)

        self.opf = OPF(self.pathtoopf, os.path.dirname(self.pathtoopf))
        self.language = self.opf.language
        if self.language:
            self.language = self.language.lower()
        self.spine = [SpineItem(i.path) for i in self.opf.spine]

        cover = self.opf.cover
        if self.ebook_ext in ('lit', 'mobi', 'prc', 'opf') and cover:
            # Generate a title page next to the first spine item and put it
            # at the front of the spine.
            cfile = os.path.join(os.path.dirname(self.spine[0]),
                    'calibre_iterator_cover.html')
            chtml = (TITLEPAGE%cover).encode('utf-8')
            with open(cfile, 'wb') as stream:
                stream.write(chtml)
            self.spine[0:0] = [SpineItem(cfile)]
            self.delete_on_exit.append(cfile)

        if self.opf.path_to_html_toc is not None and \
           self.opf.path_to_html_toc not in self.spine:
            try:
                self.spine.append(SpineItem(self.opf.path_to_html_toc))
            except Exception:
                # Best effort: a broken HTML TOC must not prevent opening
                # the book, but KeyboardInterrupt/SystemExit still propagate.
                import traceback
                traceback.print_exc()

        # Estimate page numbers: every item gets ceil(chars / page size)
        # pages and page numbering starts at 1.
        self.pages = []
        start = 1
        for item in self.spine:
            item.pages = math.ceil(
                    item.character_count/float(self.CHARACTERS_PER_PAGE))
            item.start_page = start
            item.max_page = start + item.pages - 1
            start += item.pages
            self.pages.append(item.pages)
        self.toc = self.opf.toc

        self.find_embedded_fonts()
        self.read_bookmarks()

        return self

    def parse_bookmarks(self, raw):
        '''
        Append a ``(title, ref)`` tuple to ``self.bookmarks`` for every line
        of `raw` that contains a ``^``. The line is split at its last ``^``.
        '''
        for line in raw.splitlines():
            if '^' in line:
                title, _sep, ref = line.rpartition('^')
                self.bookmarks.append((title, ref))

    def serialize_bookmarks(self, bookmarks):
        '''
        Return `bookmarks` (an iterable of ``(title, ref)`` pairs) as UTF-8
        encoded bytes, one ``title^ref`` line per bookmark, each terminated
        by a newline. An empty iterable yields a single newline.
        '''
        lines = [u'%s^%s'%(title, ref) for title, ref in bookmarks]
        return (u'\n'.join(lines) +'\n').encode('utf-8')

    def read_bookmarks(self):
        '''
        Reset ``self.bookmarks`` and load them either from
        ``META-INF/calibre_bookmarks.txt`` inside the unpacked book or, when
        that file does not exist, from the ``bookmarks_<path>`` config entry.
        '''
        self.bookmarks = []
        bmfile = os.path.join(self.base, 'META-INF', 'calibre_bookmarks.txt')
        raw = ''
        if os.path.exists(bmfile):
            with open(bmfile, 'rb') as stream:
                raw = stream.read().decode('utf-8')
        else:
            saved = self.config['bookmarks_'+self.pathtoebook]
            if saved:
                raw = saved
        # save_bookmarks() stores UTF-8 encoded bytes in the config. Decode
        # them so titles are always text; otherwise non-ASCII titles break
        # when they are serialized again.
        if isinstance(raw, bytes):
            raw = raw.decode('utf-8')
        self.parse_bookmarks(raw)

    def save_bookmarks(self, bookmarks=None):
        '''
        Persist `bookmarks` (default: ``self.bookmarks``).

        For a readable ``.epub`` file the bookmarks are stored inside the
        archive as ``META-INF/calibre_bookmarks.txt``; if the file cannot be
        opened for updating nothing is saved. For every other book they are
        stored in the config under ``bookmarks_<path>``.
        '''
        if bookmarks is None:
            bookmarks = self.bookmarks
        dat = self.serialize_bookmarks(bookmarks)
        is_epub = os.path.splitext(self.pathtoebook)[1].lower() == '.epub'
        if not (is_epub and os.access(self.pathtoebook, os.R_OK)):
            self.config['bookmarks_'+self.pathtoebook] = dat
            return

        entry = 'META-INF/calibre_bookmarks.txt'
        try:
            zf = open(self.pathtoebook, 'r+b')
        except IOError:
            return
        try:
            zipf = ZipFile(zf, mode='a')
            try:
                if entry in zipf.namelist():
                    safe_replace(zf, entry, StringIO(dat))
                else:
                    zipf.writestr(entry, dat)
            finally:
                # Closing is what finalizes an appended entry in the
                # archive; do it deterministically instead of relying on
                # garbage collection.
                zipf.close()
        finally:
            zf.close()

    def add_bookmark(self, bm):
        '''
        Add the ``(title, ref)`` pair `bm`, replacing any existing bookmarks
        with the same title, and save the bookmarks.
        '''
        # Slice assignment keeps the same list object, as the original
        # in-place removal did.
        self.bookmarks[:] = [x for x in self.bookmarks if x[0] != bm[0]]
        self.bookmarks.append(bm)
        self.save_bookmarks()

    def set_bookmarks(self, bookmarks):
        '''Replace the in-memory bookmark list; nothing is saved.'''
        self.bookmarks = bookmarks

    def __exit__(self, *args):
        '''
        Exit the TemporaryDirectory and remove any files listed in
        ``delete_on_exit`` that still exist.
        '''
        self._tdir.__exit__(*args)
        for x in self.delete_on_exit:
            if os.path.exists(x):
                os.remove(x)