1
from __future__ import with_statement
3
__copyright__ = '2008 Kovid Goyal <kovid at kovidgoyal.net>'
6
Iterate over the HTML files in an ebook. Useful for writing viewers.
10
from cStringIO import StringIO
12
from PyQt4.Qt import QFontDatabase
14
from calibre.customize.ui import available_input_formats
15
from calibre.ebooks.metadata.opf2 import OPF
16
from calibre.ptempfile import TemporaryDirectory
17
from calibre.ebooks.chardet import xml_to_unicode
18
from calibre.utils.zipfile import safe_replace, ZipFile
19
from calibre.utils.config import DynamicConfig
20
from calibre.utils.logging import Log
21
from calibre.ebooks.epub.output import EPUBOutput
23
# Cover-page template taken from the EPUB output plugin. It is decoded from
# UTF-8 once here so it can be %-formatted with the OPF cover value later on.
TITLEPAGE = EPUBOutput.TITLEPAGE_COVER.decode('utf-8')
25
def character_count(html):
    '''
    Return the number of "significant" text characters in a HTML string.

    Only text sitting between a ``>`` and the following ``<`` is counted,
    and every run of whitespace inside such a text node counts as a single
    character.

    :param html: The markup to measure.
    :return: The character count as an integer, 0 if there are no text nodes.
    '''
    strip_space = re.compile(r'\s+')
    count = 0
    for match in re.finditer(r'>[^<]+<', html):
        # The match includes the delimiting '>' and '<', hence the -2.
        count += len(strip_space.sub(' ', match.group())) - 2
    return count
35
class UnsupportedFormatError(Exception):
    '''
    Error whose message says that books of a given format are not supported.
    '''

    def __init__(self, fmt):
        # The format name is shown upper-cased inside the translated message.
        message = _('%s format books are not supported') % fmt.upper()
        Exception.__init__(self, message)
40
class SpineItem(unicode):
    '''
    A unicode path to one HTML file of the book, annotated with metadata.

    Attributes set on creation:

      * ``encoding``: the encoding reported by ``xml_to_unicode``
      * ``character_count``: ``character_count()`` of the decoded markup
      * ``start_page``, ``pages``, ``max_page``: -1 until pagination is
        computed by the iterator
    '''

    def __new__(cls, *args):
        # NOTE(review): the binding of `path`, the body of the fallback
        # branch and the final return were missing from the mangled text;
        # they are reconstructed here from how `path`/`ppath` are used.
        args = list(args)
        path = args[0]
        # A spine reference may carry a '#fragment'. When the literal path
        # does not exist on disk, fall back to the bare file name so that
        # the resulting string can be opened directly by callers.
        ppath = path.partition('#')[0]
        if not os.path.exists(path) and os.path.exists(ppath):
            path = ppath
        args[0] = path
        obj = super(SpineItem, cls).__new__(cls, *args)
        with open(path, 'rb') as stream:
            raw = stream.read()
        raw, obj.encoding = xml_to_unicode(raw)
        obj.character_count = character_count(raw)
        obj.start_page = -1
        obj.pages = -1
        obj.max_page = -1
        return obj
58
# NOTE(review): only the class statement is visible in this view; the class
# body is not shown and no visible code refers to FakeOpts -- confirm its
# attributes against the complete file before relying on it.
class FakeOpts(object):
64
def is_supported(path):
    '''
    Tell whether the extension of `path` is one of the available input formats.

    The extension is taken without dots and lower-cased; the htm, html, xhtm
    and xhtml spellings are all looked up as ``html``.
    '''
    suffix = os.path.splitext(path)[1]
    fmt = re.sub(r'(x{0,1})htm(l{0,1})', 'html',
                 suffix.replace('.', '').lower())
    return fmt in available_input_formats()
70
def write_oebbook(oeb, path):
    '''
    Write the OEB book `oeb` into the directory `path`.

    :param oeb: The in-memory OEB book to serialize.
    :param path: Destination directory.
    :return: Path of the first ``.opf`` file found under `path`, or None if
             the writer produced none.
    '''
    from calibre.ebooks.oeb.writer import OEBWriter
    from calibre import walk
    # NOTE(review): the writer invocation and the directory walk were missing
    # from the mangled text; they are reconstructed from the two imports and
    # the surviving '.opf' test -- confirm the OEBWriter call signature.
    writer = OEBWriter()
    writer(oeb, path)
    for f in walk(path):
        if f.endswith('.opf'):
            return f
    return None
79
# Gives access to the HTML files (the spine) of an ebook, together with the
# page numbers, table of contents and bookmarks that the methods below set up.
class EbookIterator(object):
81
# Number of significant characters counted as one page when the page count
# of each spine item is computed.
CHARACTERS_PER_PAGE = 1000
83
def __init__(self, pathtoebook, log=None):
    '''
    Remember where the book lives and what kind of file it is.

    :param pathtoebook: Path to the ebook; surrounding whitespace is ignored.
    :param log: Log object handed to the conversion machinery. A new ``Log``
                is created when none is given.
    '''
    # NOTE(review): the assignments to self.log and self.ebook_ext were
    # missing from the mangled text; both attributes are read by other
    # methods of this class, so they are restored here.
    self.log = Log() if log is None else log
    pathtoebook = pathtoebook.strip()
    self.pathtoebook = os.path.abspath(pathtoebook)
    self.config = DynamicConfig(name='iterator')
    ext = os.path.splitext(pathtoebook)[1].replace('.', '').lower()
    # htm, xhtm and xhtml are all treated as html.
    self.ebook_ext = re.sub(r'(x{0,1})htm(l{0,1})', 'html', ext)
94
def search(self, text, index):
    '''
    Case-insensitively look for `text` in the spine items after `index`.

    :param text: The text to look for.
    :param index: Spine position to search after; pass -1 to search from the
                  first item.
    :return: Position of the first matching spine item, or None.
    '''
    # NOTE(review): the lower-casing of `text`, the use of `index` and the
    # return statement were missing from the mangled text. The strict
    # "after index" comparison is a reconstruction -- confirm.
    text = text.lower()
    for i, path in enumerate(self.spine):
        if i <= index:
            continue
        with open(path, 'rb') as stream:
            raw = stream.read()
        if text in raw.decode(path.encoding).lower():
            return i
    return None
101
def find_embedded_fonts(self):
103
This will become unnecessary once Qt WebKit supports the @font-face rule.
105
for item in self.opf.manifest:
106
if item.mime_type and 'css' in item.mime_type.lower():
107
css = open(item.path, 'rb').read().decode('utf-8', 'replace')
108
for match in re.compile(r'@font-face\s*{([^}]+)}').finditer(css):
109
block = match.group(1)
110
family = re.compile(r'font-family\s*:\s*([^;]+)').search(block)
111
url = re.compile(r'url\s*\([\'"]*(.+?)[\'"]*\)', re.DOTALL).search(block)
113
path = url.group(1).split('/')
114
path = os.path.join(os.path.dirname(item.path), *path)
115
id = QFontDatabase.addApplicationFont(path)
117
families = [unicode(f) for f in QFontDatabase.applicationFontFamilies(id)]
119
family = family.group(1).strip().replace('"', '')
120
if family not in families:
121
print 'WARNING: Family aliasing not supported:', block
123
print 'Loaded embedded font:', repr(family)
126
# Unpacks the book into a temporary directory, builds the spine with page
# information, then loads embedded fonts and bookmarks.
# NOTE(review): the `def` line for these statements is not visible in this
# view. The manual self._tdir.__enter__() call below suggests this is the
# body of __enter__ -- confirm against the complete file.
self.delete_on_exit = []
127
# The temporary directory is entered by hand and kept on self._tdir.
self._tdir = TemporaryDirectory('_ebook_iter')
128
self.base = self._tdir.__enter__()
129
from calibre.ebooks.conversion.plumber import Plumber, create_oebbook
130
plumber = Plumber(self.pathtoebook, self.base, self.log)
131
plumber.setup_options()
132
if self.pathtoebook.lower().endswith('.opf'):
133
plumber.opts.dont_package = True
134
if hasattr(plumber.opts, 'no_process'):
135
plumber.opts.no_process = True
137
# Flag the input plugin as running for the viewer before invoking it.
plumber.input_plugin.for_viewer = True
138
# NOTE(review): the closing arguments of this call are not visible in this view.
self.pathtoopf = plumber.input_plugin(open(plumber.input, 'rb'),
139
plumber.opts, plumber.input_fmt, self.log,
142
# For pdf and rb input the plugin result is passed through create_oebbook().
if plumber.input_fmt.lower() in ('pdf', 'rb'):
143
self.pathtoopf = create_oebbook(self.log, self.pathtoopf, plumber.opts,
144
plumber.input_plugin)
145
# A result that has a `manifest` attribute is written to self.base;
# presumably write_oebbook() hands back the path of the OPF -- confirm.
if hasattr(self.pathtoopf, 'manifest'):
146
self.pathtoopf = write_oebbook(self.pathtoopf, self.base)
149
self.opf = OPF(self.pathtoopf, os.path.dirname(self.pathtoopf))
150
self.language = self.opf.language
152
# NOTE(review): the line before this one is not visible -- presumably a guard
# for a missing language; confirm.
self.language = self.language.lower()
153
self.spine = [SpineItem(i.path) for i in self.opf.spine]
155
cover = self.opf.cover
156
# For these formats a generated cover page is put at the front of the spine
# and the generated file is remembered in self.delete_on_exit.
if self.ebook_ext in ('lit', 'mobi', 'prc', 'opf') and cover:
157
cfile = os.path.join(os.path.dirname(self.spine[0]),
158
'calibre_iterator_cover.html')
159
chtml = (TITLEPAGE%cover).encode('utf-8')
160
open(cfile, 'wb').write(chtml)
161
self.spine[0:0] = [SpineItem(cfile)]
162
self.delete_on_exit.append(cfile)
164
# Append the HTML table of contents to the spine unless it is already there.
if self.opf.path_to_html_toc is not None and \
165
self.opf.path_to_html_toc not in self.spine:
167
self.spine.append(SpineItem(self.opf.path_to_html_toc))
170
# NOTE(review): print_exc() implies an enclosing try/except that is not
# visible in this view.
traceback.print_exc()
173
# Pages per spine item: its character count divided by CHARACTERS_PER_PAGE,
# rounded up.
sizes = [i.character_count for i in self.spine]
174
self.pages = [math.ceil(i/float(self.CHARACTERS_PER_PAGE)) for i in sizes]
175
# NOTE(review): most of the statements that assign pages/start_page to each
# spine item are not visible here; only the max_page computation survives.
for p, s in zip(self.pages, self.spine):
182
s.max_page = s.start_page + s.pages - 1
183
self.toc = self.opf.toc
185
self.find_embedded_fonts()
186
self.read_bookmarks()
190
def parse_bookmarks(self, raw):
    '''
    Append the bookmarks found in `raw` to ``self.bookmarks``.

    Every line of the form ``title^reference`` yields one
    ``(title, reference)`` tuple. The line is split at its last ``^``, so a
    title may itself contain that character. Lines without a ``^`` are
    ignored.

    :param raw: Bookmark text, one bookmark per line.
    '''
    for line in raw.splitlines():
        if '^' in line:
            title, _sep, ref = line.rpartition('^')
            self.bookmarks.append((title, ref))
197
def serialize_bookmarks(self, bookmarks):
    '''
    Turn `bookmarks` into the text form read by ``parse_bookmarks``.

    :param bookmarks: Iterable of ``(title, reference)`` pairs.
    :return: UTF-8 encoded bytes with one ``title^reference`` line per
             bookmark; the result always ends with a newline.
    '''
    lines = [u'%s^%s' % (title, bm) for title, bm in bookmarks]
    return (u'\n'.join(lines) + u'\n').encode('utf-8')
203
def read_bookmarks(self):
    '''
    Load the bookmarks saved for this book into ``self.bookmarks``.

    A ``META-INF/calibre_bookmarks.txt`` file inside the unpacked book is
    used when present; otherwise the entry stored for this book in the
    iterator configuration is used. With neither, the list stays empty.
    '''
    # NOTE(review): the initialisation of self.bookmarks/raw and the
    # fallback branch were missing from the mangled text and are
    # reconstructed from the surviving statements -- confirm.
    self.bookmarks = []
    bmfile = os.path.join(self.base, 'META-INF', 'calibre_bookmarks.txt')
    raw = ''
    if os.path.exists(bmfile):
        with open(bmfile, 'rb') as stream:
            raw = stream.read().decode('utf-8')
    else:
        saved = self.config['bookmarks_'+self.pathtoebook]
        if saved:
            raw = saved
    self.parse_bookmarks(raw)
215
def save_bookmarks(self, bookmarks=None):
    '''
    Persist bookmarks: inside the book for an EPUB, else in the configuration.

    :param bookmarks: Iterable of ``(title, reference)`` pairs to store;
                      defaults to ``self.bookmarks``.

    For a readable ``.epub`` the data goes into
    ``META-INF/calibre_bookmarks.txt`` of the archive, replacing an existing
    entry. If the file cannot be opened for writing nothing is saved.
    '''
    # NOTE(review): the try/except around open(), the early exit after the
    # in-place replacement and the else-branch were missing from the mangled
    # text and are reconstructed here -- confirm.
    if bookmarks is None:
        bookmarks = self.bookmarks
    dat = self.serialize_bookmarks(bookmarks)
    if os.path.splitext(self.pathtoebook)[1].lower() == '.epub' and \
            os.access(self.pathtoebook, os.R_OK):
        try:
            zf = open(self.pathtoebook, 'r+b')
        except IOError:
            # Readable but not writable.
            return
        try:
            zipf = ZipFile(zf, mode='a')
            if 'META-INF/calibre_bookmarks.txt' in zipf.namelist():
                safe_replace(zf, 'META-INF/calibre_bookmarks.txt', StringIO(dat))
            else:
                zipf.writestr('META-INF/calibre_bookmarks.txt', dat)
                # Closing flushes the updated archive directory to the file.
                zipf.close()
        finally:
            zf.close()
    else:
        self.config['bookmarks_'+self.pathtoebook] = dat
234
def add_bookmark(self, bm):
    '''
    Add the bookmark `bm` and save the bookmark list.

    :param bm: A ``(title, reference)`` pair. Existing bookmarks with the
               same title are dropped first.
    '''
    # NOTE(review): the duplicate test was missing from the mangled text;
    # matching on the title (first tuple element) is a reconstruction --
    # confirm. The list is filtered in place so it is never mutated while
    # being iterated and other references to it stay valid.
    self.bookmarks[:] = [x for x in self.bookmarks if x[0] != bm[0]]
    self.bookmarks.append(bm)
    self.save_bookmarks()
244
def set_bookmarks(self, bookmarks):
    '''
    Replace the current bookmark list with `bookmarks`.

    The object is stored as given (no copy is made) and nothing is saved.
    '''
    self.bookmarks = bookmarks
247
def __exit__(self, *args):
248
self._tdir.__exit__(*args)
249
for x in self.delete_on_exit:
250
if os.path.exists(x):