2
__copyright__ = '2008 Kovid Goyal <kovid at kovidgoyal.net>'
5
Iterate over the HTML files in an ebook. Useful for writing viewers.
8
import re, os, math, copy
9
from cStringIO import StringIO
11
from PyQt4.Qt import QFontDatabase
13
from calibre.ebooks.epub.from_any import MAP
14
from calibre.ebooks.epub.from_html import TITLEPAGE
15
from calibre.ebooks.epub import config
16
from calibre.ebooks.metadata.opf2 import OPF
17
from calibre.ptempfile import TemporaryDirectory
18
from calibre.ebooks.chardet import xml_to_unicode
19
from calibre.ebooks.html import create_dir
20
from calibre.utils.zipfile import safe_replace, ZipFile
21
from calibre.utils.config import DynamicConfig
23
def character_count(html):
    '''
    Return the number of "significant" text characters in a HTML string.

    Only text that sits between a ``>`` and the following ``<`` is counted,
    and every run of whitespace inside such text counts as one character.
    Text before the first tag or after the last one is therefore ignored.

    :param html: The HTML markup to measure.
    :return: The number of significant characters (0 if there are none).
    '''
    strip_space = re.compile(r'\s+')
    count = 0
    for match in re.finditer(r'>[^<]+<', html):
        # The matched text includes the delimiting '>' and '<', hence the -2.
        count += len(strip_space.sub(' ', match.group())) - 2
    return count
33
class UnsupportedFormatError(Exception):
    '''
    Error carrying a message that books of a given format are not supported.
    '''

    def __init__(self, fmt):
        '''
        :param fmt: The name of the format; it is upper-cased in the message.
        '''
        # `_` is not defined in this file -- presumably the translation
        # function made available globally by the application (TODO confirm).
        message = _('%s format books are not supported') % fmt.upper()
        Exception.__init__(self, message)
38
class SpineItem(unicode):
    '''
    A unicode string holding the path of a file, annotated with information
    about the contents of that file.

    Any ``#fragment`` suffix is removed from the path before it is stored.
    '''

    # Attributes set on every instance:
    #   encoding        -- second value returned by xml_to_unicode() for the
    #                      raw file contents
    #   character_count -- character_count() of the decoded file contents
    #   start_page, pages, max_page -- -1 until pagination assigns real values

    def __new__(cls, *args):
        '''
        Create the item and read the file it points to.

        :param args: Arguments for the ``unicode`` constructor; the first one
                     is the path (optionally followed by ``#fragment``).
        '''
        # *args arrives as a tuple, which cannot be modified in place.
        args = list(args)
        args[0] = args[0].partition('#')[0]
        obj = super(SpineItem, cls).__new__(cls, *args)
        path = args[0]
        stream = open(path, 'rb')
        try:
            raw = stream.read()
        finally:
            # Close the handle explicitly instead of relying on the
            # garbage collector.
            stream.close()
        raw, obj.encoding = xml_to_unicode(raw)
        obj.character_count = character_count(raw)
        # Placeholders meaning "not paginated yet".
        obj.start_page = -1
        obj.pages = -1
        obj.max_page = -1
        return obj
53
def html2opf(path, tdir, opts):
    '''
    Run create_dir() on the HTML file at `path` and return the path of the
    ``metadata.opf`` file expected in `tdir`.

    :param path: Path to the HTML file.
    :param tdir: Directory in which the result is expected.
    :param opts: Conversion options; a shallow copy is modified, the object
                 passed in is left untouched.
    :return: ``os.path.join(tdir, 'metadata.opf')``
    '''
    # Work on a copy so that the caller's options are not changed.
    opts = copy.copy(opts)
    # NOTE(review): assumes create_dir() writes its output (including
    # metadata.opf) into ``opts.output`` -- confirm against create_dir.
    opts.output = tdir
    create_dir(path, opts)
    return os.path.join(tdir, 'metadata.opf')
59
def opf2opf(path, tdir, opts):
    '''
    Pass-through counterpart of html2opf() for input that already is an OPF
    file: nothing has to be converted, so the path is returned unchanged.

    :param path: Path to the OPF file.
    :param tdir: Unused; present so the signature matches html2opf().
    :param opts: Unused; present so the signature matches html2opf().
    :return: `path`
    '''
    return path
62
def is_supported(path):
    '''
    Tell whether the file at `path` has a supported extension.

    The extension is lower-cased and htm/html/xhtm/xhtml are all treated as
    ``html``. It is supported if it is a key of MAP or one of ``html`` and
    ``opf``.

    :param path: Path (or file name) to check.
    :return: True if the extension is supported, False otherwise.
    '''
    suffix = os.path.splitext(path)[1]
    fmt = suffix.replace('.', '').lower()
    fmt = re.sub(r'(x{0,1})htm(l{0,1})', 'html', fmt)
    known = list(MAP.keys())
    known.extend(['html', 'opf'])
    return fmt in known
67
class EbookIterator(object):
    '''
    Iterate over the HTML files in an ebook. Useful for writing viewers.

    Meant to be used as a context manager: entering it creates a temporary
    directory, obtains an OPF for the book, builds the paginated ``spine``
    and loads the bookmarks; leaving it exits the temporary directory.
    '''

    # Number of "significant" characters that make up one page.
    CHARACTERS_PER_PAGE = 1000

    # Patterns used to find embedded fonts in CSS, compiled only once.
    _FONT_FACE_PAT = re.compile(r'@font-face\s*{([^}]+)}')
    _FONT_FAMILY_PAT = re.compile(r'font-family\s*:\s*([^;]+)')
    _FONT_URL_PAT = re.compile(r'url\s*\([\'"]*(.+?)[\'"]*\)', re.DOTALL)

    def __init__(self, pathtoebook):
        '''
        :param pathtoebook: Path to the ebook; surrounding whitespace is
                            stripped.
        :raises UnsupportedFormatError: If there is no converter for the
                                        extension of `pathtoebook`.
        '''
        pathtoebook = pathtoebook.strip()
        self.pathtoebook = os.path.abspath(pathtoebook)
        self.config = DynamicConfig(name='iterator')
        ext = os.path.splitext(pathtoebook)[1].replace('.', '').lower()
        # Treat htm, html, xhtm and xhtml alike.
        ext = re.sub(r'(x{0,1})htm(l{0,1})', 'html', ext)
        # Copy MAP so that the additions below stay local to this object.
        converters = dict(MAP)
        converters['html'] = html2opf
        converters['opf'] = opf2opf
        if ext not in converters:
            raise UnsupportedFormatError(ext)
        self.to_opf = converters[ext]

    @staticmethod
    def _read_file(path):
        '''Return the raw contents of the file at `path`, closing it again.'''
        stream = open(path, 'rb')
        try:
            return stream.read()
        finally:
            stream.close()

    def search(self, text, index):
        '''
        Case-insensitively look for `text` in the spine items that come
        after position `index`.

        :param text: The text to look for.
        :param index: Position in the spine after which to start looking.
        :return: The position of the first spine item containing `text`, or
                 None if there is none.
        '''
        text = text.lower()
        for i, path in enumerate(self.spine):
            if i <= index:
                continue
            content = self._read_file(path).decode(path.encoding)
            if text in content.lower():
                return i
        return None

    def find_embedded_fonts(self):
        '''
        Register the fonts referenced by @font-face rules in the CSS files
        of the manifest with QFontDatabase.

        This will become unnecessary once Qt WebKit supports the @font-face rule.
        '''
        for item in self.opf.manifest:
            if not (item.mime_type and 'css' in item.mime_type.lower()):
                continue
            css = self._read_file(item.path).decode('utf-8', 'replace')
            for match in self._FONT_FACE_PAT.finditer(css):
                block = match.group(1)
                family = self._FONT_FAMILY_PAT.search(block)
                url = self._FONT_URL_PAT.search(block)
                if not url:
                    continue
                # The URL is resolved relative to the CSS file's directory.
                parts = url.group(1).split('/')
                path = os.path.join(os.path.dirname(item.path), *parts)
                font_id = QFontDatabase.addApplicationFont(path)
                if font_id == -1:
                    # The font could not be loaded.
                    continue
                families = [unicode(f) for f in
                            QFontDatabase.applicationFontFamilies(font_id)]
                if not family:
                    continue
                name = family.group(1).strip().replace('"', '')
                # Single-argument print(...) emits the same text whether
                # print is a statement or a function.
                if name not in families:
                    print('WARNING: Family aliasing not supported: %s' % block)
                else:
                    print('Loaded embedded font: %s' % repr(name))

    def __enter__(self):
        '''
        Obtain the OPF for the book in a temporary directory, build and
        paginate the spine, register embedded fonts and read the bookmarks.

        :return: This object.
        '''
        self._tdir = TemporaryDirectory('_ebook_iter')
        self.base = self._tdir.__enter__()
        opts = config('').parse()
        self.pathtoopf = self.to_opf(self.pathtoebook, self.base, opts)
        self.opf = OPF(self.pathtoopf, os.path.dirname(self.pathtoopf))
        self.spine = [SpineItem(i.path) for i in self.opf.spine]

        cover = self.opf.cover
        ext = os.path.splitext(self.pathtoebook)[1].lower()
        if ext in ('.lit', '.mobi', '.prc') and cover:
            # For these formats a generated cover page is put in front of
            # the spine.
            cfile = os.path.join(os.path.dirname(self.spine[0]),
                                 'calibre_ei_cover.html')
            stream = open(cfile, 'wb')
            try:
                stream.write(TITLEPAGE%cover)
            finally:
                stream.close()
            self.spine[0:0] = [SpineItem(cfile)]

        if self.opf.path_to_html_toc is not None and \
           self.opf.path_to_html_toc not in self.spine:
            self.spine.append(SpineItem(self.opf.path_to_html_toc))

        sizes = [i.character_count for i in self.spine]
        self.pages = [math.ceil(i/float(self.CHARACTERS_PER_PAGE)) for i in sizes]
        for p, s in zip(self.pages, self.spine):
            s.pages = p

        # Page numbers start at 1 and run on from one spine item to the next.
        start = 1
        for s in self.spine:
            s.start_page = start
            start += s.pages
            s.max_page = s.start_page + s.pages - 1
        self.toc = self.opf.toc

        self.find_embedded_fonts()
        self.read_bookmarks()

        return self

    def parse_bookmarks(self, raw):
        '''
        Append the bookmarks found in `raw` to ``self.bookmarks``.

        Every line containing a ``^`` yields one ``(title, ref)`` pair, split
        at the last ``^``; other lines are ignored.

        :param raw: Text as produced by serialize_bookmarks(), either already
                    decoded or as UTF-8 encoded bytes.
        '''
        if not isinstance(raw, type(u'')):
            # serialize_bookmarks() produces UTF-8 bytes and that is what
            # save_bookmarks() stores; decode so titles are always text.
            raw = raw.decode('utf-8')
        for line in raw.splitlines():
            title, sep, ref = line.rpartition('^')
            if sep:
                self.bookmarks.append((title, ref))

    def serialize_bookmarks(self, bookmarks):
        '''
        Turn `bookmarks` into the form in which they are stored.

        :param bookmarks: Iterable of ``(title, ref)`` pairs.
        :return: UTF-8 encoded bytes with one ``title^ref`` line per
                 bookmark, ending in a newline.
        '''
        dat = [u'%s^%s'%(title, bm) for title, bm in bookmarks]
        return (u'\n'.join(dat) + u'\n').encode('utf-8')

    def read_bookmarks(self):
        '''
        (Re)load ``self.bookmarks``, from ``META-INF/calibre_bookmarks.txt``
        below ``self.base`` if that file exists and otherwise from the entry
        for this book in ``self.config``.
        '''
        self.bookmarks = []
        bmfile = os.path.join(self.base, 'META-INF', 'calibre_bookmarks.txt')
        raw = u''
        if os.path.exists(bmfile):
            raw = self._read_file(bmfile).decode('utf-8')
        else:
            saved = self.config['bookmarks_'+self.pathtoebook]
            if saved:
                raw = saved
        self.parse_bookmarks(raw)

    def save_bookmarks(self, bookmarks=None):
        '''
        Store bookmarks: inside the book itself for a readable EPUB file and
        in ``self.config`` for everything else.

        An EPUB that cannot be opened for updating is silently left alone.

        :param bookmarks: Iterable of ``(title, ref)`` pairs; defaults to
                          ``self.bookmarks``.
        '''
        if bookmarks is None:
            bookmarks = self.bookmarks
        dat = self.serialize_bookmarks(bookmarks)
        is_epub = os.path.splitext(self.pathtoebook)[1].lower() == '.epub'
        if not (is_epub and os.access(self.pathtoebook, os.R_OK)):
            self.config['bookmarks_'+self.pathtoebook] = dat
            return
        try:
            zf = open(self.pathtoebook, 'r+b')
        except IOError:
            return
        try:
            zipf = ZipFile(zf, mode='a')
            try:
                if 'META-INF/calibre_bookmarks.txt' in zipf.namelist():
                    # An entry of that name already exists, so it is
                    # replaced rather than added a second time.
                    safe_replace(zf, 'META-INF/calibre_bookmarks.txt', StringIO(dat))
                else:
                    zipf.writestr('META-INF/calibre_bookmarks.txt', dat)
            finally:
                # Close explicitly, in the order in which the objects would
                # otherwise be finalised: the archive first, then the file.
                zipf.close()
        finally:
            zf.close()

    def add_bookmark(self, bm):
        '''
        Add the bookmark `bm`, replacing any bookmark with the same title,
        and save the bookmarks.

        :param bm: A ``(title, ref)`` pair.
        '''
        # Slice assignment keeps the same list object.
        self.bookmarks[:] = [x for x in self.bookmarks if x[0] != bm[0]]
        self.bookmarks.append(bm)
        self.save_bookmarks()

    def __exit__(self, *args):
        '''Exit the temporary directory that was entered in __enter__().'''
        self._tdir.__exit__(*args)