2
Container-/OPF-based input OEBBook reader.
4
from __future__ import with_statement
7
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
9
import sys, os, uuid, copy, re, cStringIO
10
from itertools import izip
11
from urlparse import urldefrag, urlparse
12
from urllib import unquote as urlunquote
13
from mimetypes import guess_type
14
from collections import defaultdict
16
from lxml import etree
19
from calibre.ebooks.oeb.base import OPF1_NS, OPF2_NS, OPF2_NSMAP, DC11_NS, \
21
from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, OEB_IMAGES, \
22
PAGE_MAP_MIME, JPEG_MIME, NCX_MIME, SVG_MIME
23
from calibre.ebooks.oeb.base import XMLDECL_RE, COLLAPSE_RE, \
24
ENTITY_RE, MS_COVER_TYPE, iterlinks
25
from calibre.ebooks.oeb.base import namespace, barename, XPath, xpath, \
26
urlnormalize, BINARY_MIME, \
27
OEBError, OEBBook, DirContainer
28
from calibre.ebooks.oeb.writer import OEBWriter
29
from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
30
from calibre.startup import get_lang
31
from calibre.ptempfile import TemporaryDirectory
32
from calibre.constants import __appname__, __version__
34
__all__ = ['OEBReader']
36
class OEBReader(object):
37
"""Read an OEBPS 1.x or OPF/OPS 2.0 file collection."""
39
COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]')
40
COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')
42
Container = DirContainer
43
"""Container type used to access book files. Override in sub-classes."""
45
DEFAULT_PROFILE = 'PRS505'
46
"""Default renderer profile for content read with this Reader."""
49
"""List of transforms to apply to content read with this Reader."""
53
"""Add any book-reading options to the :class:`Config` object
59
# Alternate constructor: build a reader from parsed command-line options.
# NOTE(review): the first parameter is `cls`, so this is presumably wrapped
# with @classmethod -- confirm the decorator and what is done with `opts`.
def generate(cls, opts):
60
"""Generate a Reader instance from command-line options."""
63
# Entry point: calling the reader instance loads the book at `path` into `oeb`.
# Steps shown here: adopt the book's logger (under both `self.logger` and
# `self.log`), attach a freshly built container to the book, parse the OPF
# package document and then populate the book from it.
# NOTE(review): sibling methods read `self.oeb`; confirm it is bound in this
# method before `_read_opf()` runs.
def __call__(self, oeb, path):
64
"""Read the book at :param:`path` into the :class:`OEBBook` object
68
self.logger = self.log = oeb.logger
69
# `Container` is a class attribute, so sub-classes can swap the storage type.
oeb.container = self.Container(path, self.logger)
70
oeb.container.log = oeb.log
71
opf = self._read_opf()
72
self._all_from_opf(opf)
75
# Normalise a parsed OPF package tree towards OPF 2.0.
#   * merges the namespace map of every element into `nsmap`;
#   * re-tags elements that have no namespace or the OPF 1 namespace via
#     OPF(barename(tag));
#   * builds a new `package` root (default namespace OPF2_NS, attributes
#     copied from the old root) with a `metadata` child carrying `nsmap`;
#   * rewrites Dublin Core element names to lower case in the DC 1.1
#     namespace;
#   * walks the `meta` elements and the manifest/spine/tours/guide sections
#     of the old tree to transfer them.
# NOTE(review): `nsmap` must be initialised before the first loop, and
# `DC_NSES` / `OPF` must be in module scope -- confirm.  The caller assigns
# the result of this method, so it presumably returns the new root.
def _clean_opf(self, opf):
77
for elem in opf.iter(tag=etree.Element):
78
nsmap.update(elem.nsmap)
79
for elem in opf.iter(tag=etree.Element):
80
if namespace(elem.tag) in ('', OPF1_NS):
81
elem.tag = OPF(barename(elem.tag))
82
# Applied after the per-element merge, so the OPF 2 prefixes win on conflict.
nsmap.update(OPF2_NSMAP)
83
attrib = dict(opf.attrib)
84
nroot = etree.Element(OPF('package'),
85
nsmap={None: OPF2_NS}, attrib=attrib)
86
metadata = etree.SubElement(nroot, OPF('metadata'), nsmap=nsmap)
87
# OEB 1.x wrapper elements; only their children are of interest.
ignored = (OPF('dc-metadata'), OPF('x-metadata'))
88
for elem in xpath(opf, 'o2:metadata//*'):
89
if elem.tag in ignored:
91
if namespace(elem.tag) in DC_NSES:
92
tag = barename(elem.tag).lower()
93
elem.tag = '{%s}%s' % (DC11_NS, tag)
95
for element in xpath(opf, 'o2:metadata//o2:meta'):
96
metadata.append(element)
97
for tag in ('o2:manifest', 'o2:spine', 'o2:tours', 'o2:guide'):
98
for element in xpath(opf, tag):
103
data = self.oeb.container.read(None)
104
data = self.oeb.decode(data)
105
data = XMLDECL_RE.sub('', data)
107
opf = etree.fromstring(data)
108
except etree.XMLSyntaxError:
109
repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0))
110
data = ENTITY_RE.sub(repl, data)
112
opf = etree.fromstring(data)
113
self.logger.warn('OPF contains invalid HTML named entities')
114
except etree.XMLSyntaxError:
115
data = re.sub(r'(?is)<tours>.+</tours>', '', data)
116
self.logger.warn('OPF contains invalid tours section')
117
opf = etree.fromstring(data)
119
ns = namespace(opf.tag)
120
if ns not in ('', OPF1_NS, OPF2_NS):
121
raise OEBError('Invalid namespace %r for OPF document' % ns)
122
opf = self._clean_opf(opf)
125
# Populate `self.oeb.metadata` from the OPF tree.
# The tree is serialised and re-parsed through calibre's OPF metadata
# reader, wrapped in a MetaInformation object, given fallback values, and
# copied into the book with meta_info_to_oeb_metadata().  A 'calibre-uuid'
# identifier is then added and `self.oeb.uid` is set to the first identifier.
# NOTE(review): the `OPF` imported below is the metadata.opf2 class and is
# local to this method; other methods call a different module-level `OPF`.
# NOTE(review): the language/title/authors fallbacks are presumably guarded
# by "is missing" checks like the one shown for book_producer -- confirm.
def _metadata_from_opf(self, opf):
126
from calibre.ebooks.metadata.opf2 import OPF
127
from calibre.ebooks.metadata import MetaInformation
128
from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
129
stream = cStringIO.StringIO(etree.tostring(opf))
130
mi = MetaInformation(OPF(stream))
132
mi.language = get_lang()
133
self.oeb.metadata.add('language', mi.language)
135
mi.title = self.oeb.translate(__('Unknown'))
137
mi.authors = [self.oeb.translate(__('Unknown'))]
138
if not mi.book_producer:
139
mi.book_producer = '%(a)s (%(v)s) [http://%(a)s.kovidgoyal.net]'%\
140
dict(a=__appname__, v=__version__)
141
meta_info_to_oeb_metadata(mi, self.oeb.metadata, self.logger)
142
# The conditional expression binds looser than `%`: a new urn:uuid is made
# only when application_id is None, otherwise application_id is used as-is.
bookid = "urn:uuid:%s" % str(uuid.uuid4()) if mi.application_id is None \
143
else mi.application_id
144
self.oeb.metadata.add('identifier', bookid, id='calibre-uuid')
145
self.oeb.uid = self.oeb.metadata.identifier[0]
147
# Drop manifest items whose content cannot be parsed.
# Only items whose media type is a document or stylesheet type
# (OEB_DOCS | OEB_STYLES) are checked.  The loop runs over a list() copy of
# the manifest values so that items can be removed while iterating.  A
# failure is logged with its traceback via logger.exception().
# NOTE(review): the caller binds this method's result as `invalid` and hands
# it to _manifest_add_missing(), so it presumably returns the hrefs of the
# removed items -- confirm.
def _manifest_prune_invalid(self):
149
Remove items from manifest that contain invalid data. This prevents
150
catastrophic conversion failure, when a few files contain corrupted
154
check = OEB_DOCS.union(OEB_STYLES)
155
for item in list(self.oeb.manifest.values()):
156
if item.media_type in check:
160
self.logger.exception('Failed to parse content in %s'%
163
self.oeb.manifest.remove(item)
166
# Add files that are referenced by manifest items but not themselves listed.
# Documents and any '/xml' or '+xml' media type are scanned with iterlinks();
# stylesheets are scanned with cssutils.getUrls().  Each reference has its
# fragment removed, is normalised and resolved against the referring item,
# and is treated as a candidate only when it has no URL scheme (i.e. is
# local) and is not already a known href.  Candidates missing from the
# container get a "not found" warning; the others get a "not in manifest"
# warning and a new manifest entry with a generated id and a media type
# guessed from the file name (BINARY_MIME when the guess fails).
# NOTE(review): `cssutils` and the `warned` collection must be in scope, and
# `invalid` is presumably used to skip hrefs that were just pruned -- confirm.
# NOTE(review): the local `id` shadows the builtin of the same name.
def _manifest_add_missing(self, invalid):
167
manifest = self.oeb.manifest
168
known = set(manifest.hrefs)
169
unchecked = set(manifest.values())
173
for item in unchecked:
174
if (item.media_type in OEB_DOCS or
175
item.media_type[-4:] in ('/xml', '+xml')) and \
176
item.data is not None:
177
# Index 2 of each iterlinks() result is taken as the link target.
hrefs = [r[2] for r in iterlinks(item.data)]
179
href, _ = urldefrag(href)
182
href = item.abshref(urlnormalize(href))
183
scheme = urlparse(href).scheme
184
if not scheme and href not in known:
186
elif item.media_type in OEB_STYLES:
187
for url in cssutils.getUrls(item.data):
188
href, _ = urldefrag(url)
189
href = item.abshref(urlnormalize(href))
190
scheme = urlparse(href).scheme
191
if not scheme and href not in known:
199
if href == item.abshref(urlnormalize(href)):
204
if not self.oeb.container.exists(href):
205
if href not in warned:
206
self.logger.warn('Referenced file %r not found' % href)
209
if href not in warned:
210
self.logger.warn('Referenced file %r not in manifest' % href)
212
id, _ = manifest.generate(id='added')
213
guessed = guess_type(href)[0]
214
media_type = guessed or BINARY_MIME
215
added = manifest.add(id, href, media_type)
218
# Build `self.oeb.manifest` from the <item> elements of the OPF manifest,
# then prune unparsable items and add referenced-but-unlisted files.
# Media type resolution: the 'media-type' attribute, else the misspelt
# 'mediatype' attribute; when the result is missing or the generic
# 'text/xml', a guess from the file name is preferred, then the declared
# value, then BINARY_MIME.  The final value is lower-cased.
# Warnings are logged for duplicate hrefs, for items missing from the
# container and for duplicate ids; a duplicate id gets a replacement id/href
# pair from manifest.generate().
# NOTE(review): `id` must be read from the element before the duplicate-id
# check (it also shadows the builtin) -- confirm.
# NOTE(review): elem.get('href') yields None when the attribute is absent;
# confirm guess_type() and container.exists() are never reached with None.
def _manifest_from_opf(self, opf):
219
manifest = self.oeb.manifest
220
for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'):
222
href = elem.get('href')
223
media_type = elem.get('media-type', None)
224
if media_type is None:
225
media_type = elem.get('mediatype', None)
226
if media_type is None or media_type == 'text/xml':
227
guessed = guess_type(href)[0]
228
media_type = guessed or media_type or BINARY_MIME
229
if hasattr(media_type, 'lower'):
230
media_type = media_type.lower()
231
fallback = elem.get('fallback')
232
if href in manifest.hrefs:
233
self.logger.warn(u'Duplicate manifest entry for %r' % href)
235
if not self.oeb.container.exists(href):
236
self.logger.warn(u'Manifest item %r not found' % href)
238
if id in manifest.ids:
239
self.logger.warn(u'Duplicate manifest id %r' % id)
240
id, href = manifest.generate(id, href)
241
manifest.add(id, href, media_type, fallback)
242
invalid = self._manifest_prune_invalid()
243
self._manifest_add_missing(invalid)
245
# Append to the spine, as non-linear items, documents that spine documents
# link to but that are not in the spine themselves.
# Links are taken from <a href> inside <body>; the fragment is removed and
# the href is normalised and resolved against the linking item.  A target is
# passed over when it is not a document type, is already in the spine, or is
# already queued in `extras`.  Queued items are added in sorted order with
# linear=False.
# NOTE(review): `extras` must be initialised before the loop, and `version`
# (the first character of oeb.version, as an int) presumably selects how
# loudly the "not in spine" message is logged -- confirm.
def _spine_add_extra(self):
246
manifest = self.oeb.manifest
247
spine = self.oeb.spine
248
unchecked = set(spine)
249
selector = XPath('h:body//h:a/@href')
253
for item in unchecked:
254
if item.media_type not in OEB_DOCS:
255
# TODO: handle fallback chains
257
for href in selector(item.data):
258
href, _ = urldefrag(href)
261
href = item.abshref(urlnormalize(href))
262
if href not in manifest.hrefs:
264
found = manifest.hrefs[href]
265
if found.media_type not in OEB_DOCS or \
266
found in spine or found in extras:
271
version = int(self.oeb.version[0])
272
for item in sorted(extras):
275
'Spine-referenced file %r not in spine' % item.href)
276
spine.add(item, linear=False)
278
# Build `self.oeb.spine` from the <itemref> elements of the OPF spine.
# An idref that is unknown to the manifest is reported with a warning.  Each
# resolved item is added together with the raw value of its 'linear'
# attribute (None when the attribute is absent).  Afterwards, linked but
# unlisted documents are appended by _spine_add_extra().
# Raises OEBError("Spine is empty").
# NOTE(review): the raise is presumably guarded by an emptiness check on the
# spine, and the unknown-idref branch presumably skips to the next element
# -- confirm.
def _spine_from_opf(self, opf):
279
spine = self.oeb.spine
280
manifest = self.oeb.manifest
281
for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'):
282
idref = elem.get('idref')
283
if idref not in manifest.ids:
284
self.logger.warn(u'Spine item %r not found' % idref)
286
item = manifest.ids[idref]
287
spine.add(item, elem.get('linear'))
289
raise OEBError("Spine is empty")
290
self._spine_add_extra()
292
# Build `self.oeb.guide` from the <reference> elements of the OPF guide.
# The fragment is removed only for the manifest membership test; the guide
# entry itself is stored with the full href, fragment included.
# NOTE(review): a reference whose target is not in the manifest is warned
# about and presumably skipped -- confirm.
def _guide_from_opf(self, opf):
293
guide = self.oeb.guide
294
manifest = self.oeb.manifest
295
for elem in xpath(opf, '/o2:package/o2:guide/o2:reference'):
296
href = elem.get('href')
297
path = urldefrag(href)[0]
298
if path not in manifest.hrefs:
299
self.logger.warn(u'Guide reference %r not found' % href)
301
guide.add(elem.get('type'), elem.get('title'), href)
303
# Locate the NCX table-of-contents item and take it out of the manifest.
# First choice is the item named by the spine's 'toc' attribute; the
# fallback is a manifest item whose media type is NCX_MIME.
# NOTE(review): the caller passes this method's result on as `item`, so it
# presumably returns the removed item (or None) -- confirm.
# NOTE(review): the fallback loop removes from the manifest while iterating
# manifest.values(); that is safe only if the loop exits right after the
# removal or values() returns a snapshot -- confirm.
# NOTE(review): `id` must be bound from `result` before the lookup; it also
# shadows the builtin.
def _find_ncx(self, opf):
304
result = xpath(opf, '/o2:package/o2:spine/@toc')
307
if id not in self.oeb.manifest.ids:
309
item = self.oeb.manifest.ids[id]
310
self.oeb.manifest.remove(item)
312
for item in self.oeb.manifest.values():
313
if item.media_type == NCX_MIME:
314
self.oeb.manifest.remove(item)
318
# Recursively convert the <navPoint> children of `navpoint` into TOC nodes.
#   item     -- the NCX manifest item; relative hrefs are resolved against it
#   toc      -- the TOC node that receives the new child nodes
#   navpoint -- the NCX element whose direct navPoint children are read
# For every child: the label text is whitespace-collapsed, the content src
# is normalised and made absolute, and optional calibre:meta "author" and
# "description" values are read.  The node is added to `toc` and the method
# recurses with the new node as parent.
# NOTE(review): next_play_order() is evaluated as the .get() default even
# when a playOrder attribute exists; if that call advances a counter, a
# value is consumed needlessly.  A non-numeric playOrder makes int() raise
# ValueError -- confirm both are acceptable.
# NOTE(review): `id`, and the defaults of `author` / `description` when the
# meta elements are absent, must be bound before toc.add() -- confirm.
def _toc_from_navpoint(self, item, toc, navpoint):
319
children = xpath(navpoint, 'ncx:navPoint')
320
for child in children:
321
title = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()'))
322
title = COLLAPSE_RE.sub(' ', title.strip())
323
href = xpath(child, 'ncx:content/@src')
324
if not title or not href:
326
href = item.abshref(urlnormalize(href[0]))
327
path, _ = urldefrag(href)
328
if path not in self.oeb.manifest.hrefs:
329
self.logger.warn('TOC reference %r not found' % href)
332
klass = child.get('class', 'chapter')
334
po = int(child.get('playOrder', self.oeb.toc.next_play_order()))
336
authorElement = xpath(child,
337
'descendant::calibre:meta[@name = "author"]')
339
author = authorElement[0].text
343
descriptionElement = xpath(child,
344
'descendant::calibre:meta[@name = "description"]')
345
if descriptionElement:
346
description = etree.tostring(descriptionElement[0],
347
method='text', encoding=unicode).strip()
353
node = toc.add(title, href, id=id, klass=klass,
354
play_order=po, description=description, author=author)
356
self._toc_from_navpoint(item, node, child)
358
# Build the book's TOC from the NCX manifest item `item`.
# The TOC title is the whitespace-collapsed docTitle text, falling back to
# the book's first title metadata entry.  Every <navMap> is then expanded
# with _toc_from_navpoint().
# NOTE(review): _toc_from_opf() tests this method's result for truth, so it
# presumably returns False when `item` is None and True on success; `ncx`
# and `toc` must be bound from `item` / `self.oeb` first -- confirm.
def _toc_from_ncx(self, item):
361
self.log.debug('Reading TOC from NCX...')
363
title = ''.join(xpath(ncx, 'ncx:docTitle/ncx:text/text()'))
364
title = COLLAPSE_RE.sub(' ', title.strip())
365
title = title or unicode(self.oeb.metadata.title[0])
368
navmaps = xpath(ncx, 'ncx:navMap')
369
for navmap in navmaps:
370
self._toc_from_navpoint(item, toc, navmap)
373
# Build the book's TOC from an OPF <tour>.
# The tour title becomes the TOC title and every <site> that has both a
# title and an href becomes a TOC entry.  An href whose normalised,
# fragment-less path is not in the manifest is reported with a warning.
# NOTE(review): _toc_from_opf() tests this method's result for truth, so it
# presumably returns False when there is no tour and True otherwise.
# `tour`, `toc`, `site` and `id` must be bound by the surrounding statements;
# if `id` is not, the builtin id function is passed to toc.add() -- confirm.
def _toc_from_tour(self, opf):
374
result = xpath(opf, 'o2:tours/o2:tour')
377
self.log.debug('Reading TOC from tour...')
380
toc.title = tour.get('title')
381
sites = xpath(tour, 'o2:site')
383
title = site.get('title')
384
href = site.get('href')
385
if not title or not href:
387
path, _ = urldefrag(urlnormalize(href))
388
if path not in self.oeb.manifest.hrefs:
389
self.logger.warn('TOC reference %r not found' % href)
392
toc.add(title, href, id=id)
395
# Build the book's TOC from the HTML document named by the guide's 'toc'
# reference.
# When the reference has a fragment, the element carrying that id (or name)
# is located and the search climbs through its parents until an element
# containing links is reached, stopping at the document root.  Link texts
# are grouped per absolute href in `titles`; links whose target path is not
# in the manifest are filtered out.  Each href is added with its collected
# texts joined by single spaces.
# NOTE(review): manifest.hrefs[itempath] is an unguarded index lookup; a
# guide 'toc' entry pointing outside the manifest would raise here unless
# such entries never reach the guide -- confirm.
# NOTE(review): `frag` is interpolated into the XPath string unescaped, so a
# fragment containing a double quote would produce an invalid expression.
# NOTE(review): `html`, `toc` and the ordering used for the final add loop
# must be established by the surrounding statements -- confirm.
def _toc_from_html(self, opf):
396
if 'toc' not in self.oeb.guide:
398
self.log.debug('Reading TOC from HTML...')
399
itempath, frag = urldefrag(self.oeb.guide['toc'].href)
400
item = self.oeb.manifest.hrefs[itempath]
403
elems = xpath(html, './/*[@id="%s"]' % frag)
405
elems = xpath(html, './/*[@name="%s"]' % frag)
406
elem = elems[0] if elems else html
407
while elem != html and not xpath(elem, './/h:a[@href]'):
408
elem = elem.getparent()
410
titles = defaultdict(list)
412
for anchor in xpath(html, './/h:a[@href]'):
413
href = anchor.attrib['href']
414
href = item.abshref(urlnormalize(href))
415
path, frag = urldefrag(href)
416
if path not in self.oeb.manifest.hrefs:
418
title = ' '.join(xpath(anchor, './/text()'))
419
title = COLLAPSE_RE.sub(' ', title.strip())
420
if href not in titles:
422
titles[href].append(title)
425
toc.add(' '.join(titles[href]), href)
428
# Last-resort TOC: one entry per linear spine item.
# For each linear item two candidate labels are gathered: the <title> text
# of the document and the text of the first h1..h5 or strong element found
# (tags are tried in that order); '(unlabled)' is the placeholder header.
# The duplicate check on `titles` decides which candidate list becomes
# `use`; entries are then added for the linear spine items.
# NOTE(review): the collection loop skips non-linear items before gathering
# labels, but the final loop zips `use` against the whole spine and only
# then skips non-linear items.  If `use` holds one label per linear item,
# labels shift out of step after the first non-linear item -- confirm.
# NOTE(review): `html`, `titles`, `headers`, `use` and `toc` must be bound by
# the surrounding statements -- confirm.
def _toc_from_spine(self, opf):
429
self.log.warn('Generating default TOC from spine...')
433
for item in self.oeb.spine:
434
if not item.linear: continue
436
title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
437
title = COLLAPSE_RE.sub(' ', title.strip())
440
headers.append('(unlabled)')
441
for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
442
expr = '/h:html/h:body//h:%s[position()=1]/text()'
443
header = ''.join(xpath(html, expr % tag))
444
header = COLLAPSE_RE.sub(' ', header.strip())
449
# True when at least two documents share the same <title> text.
if len(titles) > len(set(titles)):
451
for title, item in izip(use, self.oeb.spine):
452
if not item.linear: continue
453
toc.add(title, item.href)
456
# Build the TOC from the best available source, trying in order: the NCX
# item, the HTML TOC named in the guide, an OPF tour, and finally a TOC
# generated from the spine.  The first strategy that returns a true value
# ends the search.
#   opf  -- the cleaned OPF tree
#   item -- the NCX manifest item handed to _toc_from_ncx()
# `self.oeb.auto_generated_toc` stays False unless the spine fallback ran.
def _toc_from_opf(self, opf, item):
457
self.oeb.auto_generated_toc = False
458
if self._toc_from_ncx(item): return
459
# Prefer HTML to tour based TOC, since several LIT files
460
# have good HTML TOCs but bad tour based TOCs
461
if self._toc_from_html(opf): return
462
if self._toc_from_tour(opf): return
463
self._toc_from_spine(opf)
464
self.oeb.auto_generated_toc = True
466
# Read the page list from the NCX <pageList>/<pageTarget> entries.
# For each target the label text is whitespace-collapsed and the content src
# is normalised and resolved against the NCX item.  The page is registered
# with its 'type' attribute (default 'normal'), its id and its class.
# NOTE(review): _pages_from_opf() tests this method's result for truth, so
# it presumably returns False when there is no NCX or no page targets and
# True otherwise; `ncx` must be bound from `item` first -- confirm.
# NOTE(review): the locals `id` and `type` shadow builtins.
def _pages_from_ncx(self, opf, item):
470
ptargets = xpath(ncx, 'ncx:pageList/ncx:pageTarget')
473
pages = self.oeb.pages
474
for ptarget in ptargets:
475
name = ''.join(xpath(ptarget, 'ncx:navLabel/ncx:text/text()'))
476
name = COLLAPSE_RE.sub(' ', name.strip())
477
href = xpath(ptarget, 'ncx:content/@src')
480
href = item.abshref(urlnormalize(href[0]))
481
id = ptarget.get('id')
482
type = ptarget.get('type', 'normal')
483
klass = ptarget.get('class')
484
pages.add(name, href, type=type, id=id, klass=klass)
487
# Locate the page-map item and take it out of the manifest.
# First choice is the item named by the spine's 'page-map' attribute; the
# fallback is a manifest item whose media type is PAGE_MAP_MIME.  This has
# the same shape as _find_ncx().
# NOTE(review): the caller uses this method's result as `item`, so it
# presumably returns the removed item (or None) -- confirm.
# NOTE(review): the fallback loop removes from the manifest while iterating
# manifest.values(); that is safe only if the loop exits right after the
# removal or values() returns a snapshot -- confirm.
def _find_page_map(self, opf):
488
result = xpath(opf, '/o2:package/o2:spine/@page-map')
491
if id not in self.oeb.manifest.ids:
493
item = self.oeb.manifest.ids[id]
494
self.oeb.manifest.remove(item)
496
for item in self.oeb.manifest.values():
497
if item.media_type == PAGE_MAP_MIME:
498
self.oeb.manifest.remove(item)
502
# Read the page list from a page-map document.
# Each <page> contributes a whitespace-collapsed name and an href that is
# normalised and resolved against the page-map item.
# The elif below matches a name made up only of roman-numeral letters in
# either case: stripping the characters 'ivxlcdm' leaves an empty string.
# NOTE(review): `pmap` and `type` must be bound by the surrounding
# statements (`type` also shadows the builtin), and the method presumably
# returns False when no page map exists and True otherwise -- confirm.
def _pages_from_page_map(self, opf):
503
item = self._find_page_map(opf)
507
pages = self.oeb.pages
508
for page in xpath(pmap, 'o2:page'):
509
name = page.get('name', '')
510
href = page.get('href')
513
name = COLLAPSE_RE.sub(' ', name.strip())
514
href = item.abshref(urlnormalize(href))
518
elif name.lower().strip('ivxlcdm') == '':
520
pages.add(name, href, type=type)
523
# Populate the page list, preferring the NCX page list and falling back to
# a page-map document.  The first source that returns a true value wins.
#   opf  -- the cleaned OPF tree
#   item -- the NCX manifest item handed to _pages_from_ncx()
def _pages_from_opf(self, opf, item):
524
if self._pages_from_ncx(opf, item): return
525
if self._pages_from_page_map(opf): return
528
# Render the HTML cover document `hcover` to an image and register it.
# The book is written out to a temporary directory, the unquoted cover href
# is resolved inside it, and render_html_svg_workaround() produces the
# image data.  The data is stored as a new manifest item with JPEG_MIME
# under a generated 'cover' id and 'cover.jpeg' href.
# NOTE(review): `writer` must be bound before it is called (presumably an
# OEBWriter instance), and the new manifest item is presumably returned,
# since _locate_cover_image() returns this method's result -- confirm.
def _cover_from_html(self, hcover):
529
from calibre.ebooks import render_html_svg_workaround
530
with TemporaryDirectory('_html_cover') as tdir:
532
writer(self.oeb, tdir)
533
path = os.path.join(tdir, urlunquote(hcover.href))
534
data = render_html_svg_workaround(path, self.logger)
537
id, href = self.oeb.manifest.generate('cover', 'cover.jpeg')
538
item = self.oeb.manifest.add(id, href, JPEG_MIME, data=data)
541
# Find, or create, the manifest item to use as the cover image.
# Sources consulted, in the order they appear:
#   1. the 'cover' metadata entry, read as a manifest id that must name an
#      image item (otherwise a warning is logged);
#   2. the guide's 'cover' reference, which is either an image or a document
#      (a document replaces the default `hcover`, the first spine item);
#   3. the guide's MS_COVER_TYPE reference, when it names an image;
#   4. the first <svg> inside the cover document, deep-copied into a new
#      '.svg' manifest item;
#   5. the first <object data=...> inside the cover document, when its
#      target is an image in the manifest;
#   6. a rendering of the cover document via _cover_from_html().
# NOTE(review): the guide 'cover' lookup indexes manifest.hrefs[href]
# directly, while the MS_COVER_TYPE lookup uses .get(href, None); the first
# form raises on a target missing from the manifest -- confirm whether that
# difference is intended.
# NOTE(review): `html` must be bound from `hcover` before the SVG/object
# probes; the locals `id` and `object` shadow builtins.
def _locate_cover_image(self):
542
if self.oeb.metadata.cover:
543
id = unicode(self.oeb.metadata.cover[0])
544
item = self.oeb.manifest.ids.get(id, None)
545
if item is not None and item.media_type in OEB_IMAGES:
548
self.logger.warn('Invalid cover image @id %r' % id)
549
hcover = self.oeb.spine[0]
550
if 'cover' in self.oeb.guide:
551
href = self.oeb.guide['cover'].href
552
item = self.oeb.manifest.hrefs[href]
553
media_type = item.media_type
554
if media_type in OEB_IMAGES:
556
elif media_type in OEB_DOCS:
559
if MS_COVER_TYPE in self.oeb.guide:
560
href = self.oeb.guide[MS_COVER_TYPE].href
561
item = self.oeb.manifest.hrefs.get(href, None)
562
if item is not None and item.media_type in OEB_IMAGES:
564
if self.COVER_SVG_XP(html):
565
svg = copy.deepcopy(self.COVER_SVG_XP(html)[0])
566
href = os.path.splitext(hcover.href)[0] + '.svg'
567
id, href = self.oeb.manifest.generate(hcover.id, href)
568
item = self.oeb.manifest.add(id, href, SVG_MIME, data=svg)
570
if self.COVER_OBJECT_XP(html):
571
object = self.COVER_OBJECT_XP(html)[0]
572
href = hcover.abshref(object.get('data'))
573
item = self.oeb.manifest.hrefs.get(href, None)
574
if item is not None and item.media_type in OEB_IMAGES:
576
return self._cover_from_html(hcover)
578
# Make the 'cover' metadata entry point at the located cover item: an
# existing entry has its value overwritten, otherwise a new one is added.
# NOTE(review): the only call to this method shown in _all_from_opf() is
# commented out, so it may be unused -- confirm before relying on it.
# NOTE(review): the metadata.add() call is presumably the else-branch of
# the check above it -- confirm.
def _ensure_cover_image(self):
579
cover = self._locate_cover_image()
580
if self.oeb.metadata.cover:
581
self.oeb.metadata.cover[0].value = cover.id
583
self.oeb.metadata.add('cover', cover.id)
585
# Populate every part of the book from the cleaned OPF tree.
# The order matters: the spine needs the manifest, the guide and NCX lookup
# need the manifest, and the TOC and page builders need the spine, guide
# and NCX item.  The package version defaults to '1.2' when the root
# element has no 'version' attribute.
def _all_from_opf(self, opf):
586
self.oeb.version = opf.get('version', '1.2')
587
self._metadata_from_opf(opf)
588
self._manifest_from_opf(opf)
589
self._spine_from_opf(opf)
590
self._guide_from_opf(opf)
591
# The NCX item is shared by the TOC reader and the page-list reader.
item = self._find_ncx(opf)
592
self._toc_from_opf(opf, item)
593
self._pages_from_opf(opf, item)
594
#self._ensure_cover_image()
597
# Command-line smoke test: read a book and print its OPF 1 and OPF 2
# documents (with a page map) as pretty-printed XML.
# The print statements are Python 2 syntax.  The default `argv=sys.argv` is
# evaluated once, when the function is defined.
# NOTE(review): `reader` (presumably an OEBReader instance) and `arg`
# (presumably each element of argv[1:]) must be bound by the surrounding
# statements, and the function presumably returns an exit status -- confirm.
def main(argv=sys.argv):
600
oeb = reader(OEBBook(), arg)
601
for name, doc in oeb.to_opf1().values():
602
print etree.tostring(doc, pretty_print=True)
603
for name, doc in oeb.to_opf2(page_map=True).values():
604
print etree.tostring(doc, pretty_print=True)
607
if __name__ == '__main__':