504
755
self._data = data
506
757
def __repr__(self):
507
return 'Item(id=%r, href=%r, media_type=%r)' \
758
return u'Item(id=%r, href=%r, media_type=%r)' \
508
759
% (self.id, self.href, self.media_type)
510
def _force_xhtml(self, data):
761
def _parse_xml(self, data):
763
return etree.fromstring(data)
764
except etree.XMLSyntaxError, err:
765
if getattr(err, 'code', 0) == 26 or str(err).startswith('Entity'):
766
data = xml_to_unicode(data, strip_encoding_pats=True,
767
resolve_entities=True)[0]
768
return etree.fromstring(data)
770
def _parse_xhtml(self, data):
771
self.oeb.log.debug('Parsing', self.href, '...')
511
772
# Convert to Unicode and normalize line endings
512
773
data = self.oeb.decode(data)
513
data = XMLDECL_RE.sub('', data)
514
# Handle broken XHTML w/ SVG (ugh)
515
if 'svg:' in data and SVG_NS not in data:
517
'<html', '<html xmlns:svg="%s"' % SVG_NS, 1)
518
if 'xlink:' in data and XLINK_NS not in data:
520
'<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1)
774
data = self.oeb.html_preprocessor(data)
776
# Remove DOCTYPE declaration as it messes up parsing
777
# Inparticular it causes tostring to insert xmlns
778
# declarations, which messes up the coercing logic
779
idx = data.find('<html')
783
if '<!DOCTYPE' in pre:
785
for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
787
if val.startswith('"') and val.endswith('"'):
789
user_entities[match.group(1)] = val
791
pat = re.compile(r'&(%s);'%('|'.join(user_entities.keys())))
792
data = pat.sub(lambda m:user_entities[m.group(1)], data)
521
794
# Try with more & more drastic measures to parse
523
data = etree.fromstring(data)
524
except etree.XMLSyntaxError:
525
repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0))
526
data = ENTITY_RE.sub(repl, data)
795
def first_pass(data):
528
797
data = etree.fromstring(data)
529
798
except etree.XMLSyntaxError:
530
# TODO: Factor out HTML->XML coercion
531
self.oeb.logger.warn('Parsing file %r as HTML' % self.href)
532
data = html.fromstring(data)
533
data.attrib.pop('xmlns', None)
534
for elem in data.iter(tag=etree.Comment):
536
elem.text = elem.text.strip('-')
537
data = etree.tostring(data, encoding=unicode)
799
repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0))
800
data = ENTITY_RE.sub(repl, data)
539
802
data = etree.fromstring(data)
540
803
except etree.XMLSyntaxError:
541
data = etree.fromstring(data, parser=RECOVER_PARSER)
804
self.oeb.logger.warn('Parsing file %r as HTML' % self.href)
805
data = html.fromstring(data)
806
data.attrib.pop('xmlns', None)
807
for elem in data.iter(tag=etree.Comment):
809
elem.text = elem.text.strip('-')
810
data = etree.tostring(data, encoding=unicode)
812
data = etree.fromstring(data)
813
except etree.XMLSyntaxError:
814
data = etree.fromstring(data, parser=RECOVER_PARSER)
816
data = first_pass(data)
818
# Handle weird (non-HTML/fragment) files
819
if barename(data.tag) != 'html':
820
self.oeb.log.warn('File %r does not appear to be (X)HTML'%self.href)
821
nroot = etree.fromstring('<html></html>')
823
for child in list(data):
824
if barename(child.tag) == 'body':
829
self.oeb.log.warn('File %r appears to be a HTML fragment'%self.href)
830
nroot = etree.fromstring('<html><body/></html>')
832
for child in list(data):
833
child.getparent().remove(child)
542
837
# Force into the XHTML namespace
543
if barename(data.tag) != 'html':
545
'File %r does not appear to be (X)HTML' % self.href)
546
elif not namespace(data.tag):
838
if not namespace(data.tag):
547
839
data.attrib['xmlns'] = XHTML_NS
548
840
data = etree.tostring(data, encoding=unicode)
550
842
data = etree.fromstring(data)
552
844
data=data.replace(':=', '=').replace(':>', '>')
553
data = etree.fromstring(data)
846
data = etree.fromstring(data)
847
except etree.XMLSyntaxError:
848
self.oeb.logger.warn('Stripping comments and meta tags from %s'%
850
data = re.compile(r'<!--.*?-->', re.DOTALL).sub('',
852
data = re.sub(r'<meta\s+[^>]+?>', '', data)
853
data = etree.fromstring(data)
554
854
elif namespace(data.tag) != XHTML_NS:
555
855
# OEB_DOC_NS, but possibly others
556
856
ns = namespace(data.tag)
1052
1635
class OEBBook(object):
1636
"""Representation of a book in the IDPF OEB data model."""
1054
1638
COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]')
1055
1639
COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')
1057
def __init__(self, opfpath=None, container=None, encoding=None,
1058
logger=FauxLogger()):
1059
if opfpath and not container:
1060
container = DirContainer(os.path.dirname(opfpath))
1061
opfpath = os.path.basename(opfpath)
1062
self.container = container
1641
def __init__(self, logger,
1643
css_preprocessor=CSSPreProcessor(),
1644
encoding='utf-8', pretty_print=False,
1645
input_encoding='utf-8'):
1646
"""Create empty book. Arguments:
1648
:param:`encoding`: Default encoding for textual content read
1649
from an external container.
1650
:param:`pretty_print`: Whether or not the canonical string form
1651
of XML markup is pretty-printed.
1652
:param html_preprocessor: A callable that takes a unicode object
1653
and returns a unicode object. Will be called on all html files
1654
before they are parsed.
1655
:param css_preprocessor: A callable that takes a unicode object
1656
and returns a unicode object. Will be called on all CSS files
1657
before they are parsed.
1658
:param:`logger`: A Log object to use for logging all messages
1659
related to the processing of this book. It is accessible
1660
via the instance data members :attr:`logger,log`.
1662
It provides the following public instance data members for
1663
accessing various parts of the OEB data model:
1665
:attr:`metadata`: Metadata such as title, author name(s), etc.
1666
:attr:`manifest`: Manifest of all files included in the book,
1667
including MIME types and fallback information.
1668
:attr:`spine`: In-order list of manifest items which compose
1669
the textual content of the book.
1670
:attr:`guide`: Collection of references to standard positions
1671
within the text, such as the cover, preface, etc.
1672
:attr:`toc`: Hierarchical table of contents.
1673
:attr:`pages`: List of "pages," such as indexed to a print edition of
1676
_css_log_handler.log = logger
1063
1677
self.encoding = encoding
1064
self.logger = logger
1065
if opfpath or container:
1066
opf = self._read_opf(opfpath)
1067
self._all_from_opf(opf)
1069
def _clean_opf(self, opf):
1071
for elem in opf.iter(tag=etree.Element):
1072
nsmap.update(elem.nsmap)
1073
for elem in opf.iter(tag=etree.Element):
1074
if namespace(elem.tag) in ('', OPF1_NS):
1075
elem.tag = OPF(barename(elem.tag))
1076
nsmap.update(Metadata.OPF2_NSMAP)
1077
attrib = dict(opf.attrib)
1078
nroot = etree.Element(OPF('package'),
1079
nsmap={None: OPF2_NS}, attrib=attrib)
1080
metadata = etree.SubElement(nroot, OPF('metadata'), nsmap=nsmap)
1081
ignored = (OPF('dc-metadata'), OPF('x-metadata'))
1082
for elem in xpath(opf, 'o2:metadata//*'):
1083
if elem.tag in ignored:
1085
if namespace(elem.tag) in DC_NSES:
1086
tag = barename(elem.tag).lower()
1087
elem.tag = '{%s}%s' % (DC11_NS, tag)
1088
metadata.append(elem)
1089
for element in xpath(opf, 'o2:metadata//o2:meta'):
1090
metadata.append(element)
1091
for tag in ('o2:manifest', 'o2:spine', 'o2:tours', 'o2:guide'):
1092
for element in xpath(opf, tag):
1093
nroot.append(element)
1096
def _read_opf(self, opfpath):
1097
data = self.container.read(opfpath)
1098
data = self.decode(data)
1099
data = XMLDECL_RE.sub('', data)
1101
opf = etree.fromstring(data)
1102
except etree.XMLSyntaxError:
1103
repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0))
1104
data = ENTITY_RE.sub(repl, data)
1105
opf = etree.fromstring(data)
1106
self.logger.warn('OPF contains invalid HTML named entities')
1107
ns = namespace(opf.tag)
1108
if ns not in ('', OPF1_NS, OPF2_NS):
1109
raise OEBError('Invalid namespace %r for OPF document' % ns)
1110
opf = self._clean_opf(opf)
1113
def _metadata_from_opf(self, opf):
1114
uid = opf.get('unique-identifier', None)
1678
self.input_encoding = input_encoding
1679
self.html_preprocessor = html_preprocessor
1680
self.css_preprocessor = css_preprocessor
1681
self.pretty_print = pretty_print
1682
self.logger = self.log = logger
1683
self.version = '2.0'
1684
self.container = NullContainer(self.log)
1685
self.metadata = Metadata(self)
1115
1686
self.uid = None
1116
self.metadata = metadata = Metadata(self)
1117
for elem in xpath(opf, '/o2:package/o2:metadata//*'):
1120
attrib = dict(elem.attrib)
1122
if term == OPF('meta'):
1123
term = qname(attrib.pop('name', None), nsmap)
1124
value = attrib.pop('content', None)
1126
value = COLLAPSE_RE.sub(' ', value.strip())
1127
if term and (value or attrib):
1128
metadata.add(term, value, attrib, nsmap=nsmap)
1129
haveuuid = haveid = False
1130
for ident in metadata.identifier:
1131
if unicode(ident).startswith('urn:uuid:'):
1133
if 'id' in ident.attrib:
1135
if not (haveuuid and haveid):
1136
bookid = "urn:uuid:%s" % str(uuid.uuid4())
1137
metadata.add('identifier', bookid, id='calibre-uuid')
1139
self.logger.warn(u'Unique-identifier not specified')
1140
for item in metadata.identifier:
1143
if uid is None or item.id == uid:
1147
self.logger.warn(u'Unique-identifier %r not found' % uid)
1148
for ident in metadata.identifier:
1149
if 'id' in ident.attrib:
1150
self.uid = metadata.identifier[0]
1152
if not metadata.language:
1153
self.logger.warn(u'Language not specified')
1154
metadata.add('language', get_lang())
1155
if not metadata.creator:
1156
self.logger.warn('Creator not specified')
1157
metadata.add('creator', self.translate(__('Unknown')))
1158
if not metadata.title:
1159
self.logger.warn('Title not specified')
1160
metadata.add('title', self.translate(__('Unknown')))
1162
def _manifest_add_missing(self):
1163
manifest = self.manifest
1164
known = set(manifest.hrefs)
1165
unchecked = set(manifest.values())
1168
for item in unchecked:
1169
if (item.media_type in OEB_DOCS or
1170
item.media_type[-4:] in ('/xml', '+xml')) and \
1171
item.data is not None:
1172
hrefs = [sel(item.data) for sel in LINK_SELECTORS]
1173
for href in chain(*hrefs):
1174
href, _ = urldefrag(href)
1177
href = item.abshref(urlnormalize(href))
1178
scheme = urlparse(href).scheme
1179
if not scheme and href not in known:
1181
elif item.media_type in OEB_STYLES:
1182
for match in CSSURL_RE.finditer(item.data):
1183
href, _ = urldefrag(match.group('url'))
1184
href = item.abshref(urlnormalize(href))
1185
scheme = urlparse(href).scheme
1186
if not scheme and href not in known:
1191
if not self.container.exists(href):
1192
self.logger.warn('Referenced file %r not found' % href)
1194
self.logger.warn('Referenced file %r not in manifest' % href)
1195
id, _ = manifest.generate(id='added')
1196
guessed = guess_type(href)[0]
1197
media_type = guessed or BINARY_MIME
1198
added = manifest.add(id, href, media_type)
1199
unchecked.add(added)
1201
def _manifest_from_opf(self, opf):
1202
self.manifest = manifest = Manifest(self)
1203
for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'):
1205
href = elem.get('href')
1206
media_type = elem.get('media-type', None)
1207
if media_type is None:
1208
media_type = elem.get('mediatype', None)
1209
if media_type is None or media_type == 'text/xml':
1210
guessed = guess_type(href)[0]
1211
media_type = guessed or media_type or BINARY_MIME
1212
fallback = elem.get('fallback')
1213
if href in manifest.hrefs:
1214
self.logger.warn(u'Duplicate manifest entry for %r' % href)
1216
if not self.container.exists(href):
1217
self.logger.warn(u'Manifest item %r not found' % href)
1219
if id in manifest.ids:
1220
self.logger.warn(u'Duplicate manifest id %r' % id)
1221
id, href = manifest.generate(id, href)
1222
manifest.add(id, href, media_type, fallback)
1223
self._manifest_add_missing()
1225
def _spine_add_extra(self):
1226
manifest = self.manifest
1228
unchecked = set(spine)
1229
selector = XPath('h:body//h:a/@href')
1233
for item in unchecked:
1234
if item.media_type not in OEB_DOCS:
1235
# TODO: handle fallback chains
1237
for href in selector(item.data):
1238
href, _ = urldefrag(href)
1241
href = item.abshref(urlnormalize(href))
1242
if href not in manifest.hrefs:
1244
found = manifest.hrefs[href]
1245
if found.media_type not in OEB_DOCS or \
1246
found in spine or found in extras:
1251
version = int(self.version[0])
1252
for item in sorted(extras):
1255
'Spine-referenced file %r not in spine' % item.href)
1256
spine.add(item, linear=False)
1258
def _spine_from_opf(self, opf):
1259
self.spine = spine = Spine(self)
1260
for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'):
1261
idref = elem.get('idref')
1262
if idref not in self.manifest:
1263
self.logger.warn(u'Spine item %r not found' % idref)
1265
item = self.manifest[idref]
1266
spine.add(item, elem.get('linear'))
1268
raise OEBError("Spine is empty")
1269
self._spine_add_extra()
1271
def _guide_from_opf(self, opf):
1272
self.guide = guide = Guide(self)
1273
for elem in xpath(opf, '/o2:package/o2:guide/o2:reference'):
1274
href = elem.get('href')
1275
path = urldefrag(href)[0]
1276
if path not in self.manifest.hrefs:
1277
self.logger.warn(u'Guide reference %r not found' % href)
1279
guide.add(elem.get('type'), elem.get('title'), href)
1281
def _find_ncx(self, opf):
1282
result = xpath(opf, '/o2:package/o2:spine/@toc')
1285
if id not in self.manifest.ids:
1287
item = self.manifest.ids[id]
1288
self.manifest.remove(item)
1290
for item in self.manifest.values():
1291
if item.media_type == NCX_MIME:
1292
self.manifest.remove(item)
1296
def _toc_from_navpoint(self, item, toc, navpoint):
1297
children = xpath(navpoint, 'ncx:navPoint')
1298
for child in children:
1299
title = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()'))
1300
title = COLLAPSE_RE.sub(' ', title.strip())
1301
href = xpath(child, 'ncx:content/@src')
1302
if not title or not href:
1304
href = item.abshref(urlnormalize(href[0]))
1305
path, _ = urldefrag(href)
1306
if path not in self.manifest.hrefs:
1307
self.logger.warn('TOC reference %r not found' % href)
1309
id = child.get('id')
1310
klass = child.get('class')
1311
node = toc.add(title, href, id=id, klass=klass)
1312
self._toc_from_navpoint(item, node, child)
1314
def _toc_from_ncx(self, item):
1318
title = ''.join(xpath(ncx, 'ncx:docTitle/ncx:text/text()'))
1319
title = COLLAPSE_RE.sub(' ', title.strip())
1320
title = title or unicode(self.metadata.title[0])
1321
self.toc = toc = TOC(title)
1322
navmaps = xpath(ncx, 'ncx:navMap')
1323
for navmap in navmaps:
1324
self._toc_from_navpoint(item, toc, navmap)
1327
def _toc_from_tour(self, opf):
1328
result = xpath(opf, 'o2:tours/o2:tour')
1332
self.toc = toc = TOC(tour.get('title'))
1333
sites = xpath(tour, 'o2:site')
1335
title = site.get('title')
1336
href = site.get('href')
1337
if not title or not href:
1339
path, _ = urldefrag(urlnormalize(href))
1340
if path not in self.manifest.hrefs:
1341
self.logger.warn('TOC reference %r not found' % href)
1344
toc.add(title, href, id=id)
1347
def _toc_from_html(self, opf):
1348
if 'toc' not in self.guide:
1350
self.toc = toc = TOC()
1351
itempath, frag = urldefrag(self.guide['toc'].href)
1352
item = self.manifest.hrefs[itempath]
1355
elems = xpath(html, './/*[@id="%s"]' % frag)
1357
elems = xpath(html, './/*[@name="%s"]' % frag)
1358
elem = elems[0] if elems else html
1359
while elem != html and not xpath(elem, './/h:a[@href]'):
1360
elem = elem.getparent()
1362
titles = defaultdict(list)
1364
for anchor in xpath(html, './/h:a[@href]'):
1365
href = anchor.attrib['href']
1366
href = item.abshref(urlnormalize(href))
1367
path, frag = urldefrag(href)
1368
if path not in self.manifest.hrefs:
1370
title = ' '.join(xpath(anchor, './/text()'))
1371
title = COLLAPSE_RE.sub(' ', title.strip())
1372
if href not in titles:
1374
titles[href].append(title)
1376
toc.add(' '.join(titles[href]), href)
1379
def _toc_from_spine(self, opf):
1380
self.toc = toc = TOC()
1383
for item in self.spine:
1384
if not item.linear: continue
1386
title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
1387
title = COLLAPSE_RE.sub(' ', title.strip())
1389
titles.append(title)
1390
headers.append('(unlabled)')
1391
for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
1392
expr = '/h:html/h:body//h:%s[position()=1]/text()'
1393
header = ''.join(xpath(html, expr % tag))
1394
header = COLLAPSE_RE.sub(' ', header.strip())
1396
headers[-1] = header
1399
if len(titles) > len(set(titles)):
1401
for title, item in izip(use, self.spine):
1402
if not item.linear: continue
1403
toc.add(title, item.href)
1406
def _toc_from_opf(self, opf, item):
1407
if self._toc_from_ncx(item): return
1408
if self._toc_from_tour(opf): return
1409
self.logger.warn('No metadata table of contents found')
1410
if self._toc_from_html(opf): return
1411
self._toc_from_spine(opf)
1413
def _pages_from_ncx(self, opf, item):
1417
ptargets = xpath(ncx, 'ncx:pageList/ncx:pageTarget')
1420
pages = self.pages = PageList()
1421
for ptarget in ptargets:
1422
name = ''.join(xpath(ptarget, 'ncx:navLabel/ncx:text/text()'))
1423
name = COLLAPSE_RE.sub(' ', name.strip())
1424
href = xpath(ptarget, 'ncx:content/@src')
1427
href = item.abshref(urlnormalize(href[0]))
1428
id = ptarget.get('id')
1429
type = ptarget.get('type', 'normal')
1430
klass = ptarget.get('class')
1431
pages.add(name, href, type=type, id=id, klass=klass)
1434
def _find_page_map(self, opf):
1435
result = xpath(opf, '/o2:package/o2:spine/@page-map')
1438
if id not in self.manifest.ids:
1440
item = self.manifest.ids[id]
1441
self.manifest.remove(item)
1443
for item in self.manifest.values():
1444
if item.media_type == PAGE_MAP_MIME:
1445
self.manifest.remove(item)
1449
def _pages_from_page_map(self, opf):
1450
item = self._find_page_map(opf)
1454
pages = self.pages = PageList()
1455
for page in xpath(pmap, 'o2:page'):
1456
name = page.get('name', '')
1457
href = page.get('href')
1460
name = COLLAPSE_RE.sub(' ', name.strip())
1461
href = item.abshref(urlnormalize(href))
1465
elif name.lower().strip('ivxlcdm') == '':
1467
pages.add(name, href, type=type)
1470
def _pages_from_opf(self, opf, item):
1471
if self._pages_from_ncx(opf, item): return
1472
if self._pages_from_page_map(opf): return
1687
self.manifest = Manifest(self)
1688
self.spine = Spine(self)
1689
self.guide = Guide(self)
1473
1691
self.pages = PageList()
1476
def _cover_from_html(self, hcover):
1477
with TemporaryDirectory('_html_cover') as tdir:
1478
writer = DirWriter()
1479
writer.dump(self, tdir)
1480
path = os.path.join(tdir, urlunquote(hcover.href))
1481
renderer = CoverRenderer(path)
1482
data = renderer.image_data
1483
id, href = self.manifest.generate('cover', 'cover.jpeg')
1484
item = self.manifest.add(id, href, JPEG_MIME, data=data)
1487
def _locate_cover_image(self):
1488
if self.metadata.cover:
1489
id = str(self.metadata.cover[0])
1490
item = self.manifest.ids.get(id, None)
1491
if item is not None and item.media_type in OEB_IMAGES:
1494
self.logger.warn('Invalid cover image @id %r' % id)
1495
hcover = self.spine[0]
1496
if 'cover' in self.guide:
1497
href = self.guide['cover'].href
1498
item = self.manifest.hrefs[href]
1499
media_type = item.media_type
1500
if media_type in OEB_IMAGES:
1502
elif media_type in OEB_DOCS:
1505
if MS_COVER_TYPE in self.guide:
1506
href = self.guide[MS_COVER_TYPE].href
1507
item = self.manifest.hrefs.get(href, None)
1508
if item is not None and item.media_type in OEB_IMAGES:
1510
if self.COVER_SVG_XP(html):
1511
svg = copy.deepcopy(self.COVER_SVG_XP(html)[0])
1512
href = os.path.splitext(hcover.href)[0] + '.svg'
1513
id, href = self.manifest.generate(hcover.id, href)
1514
item = self.manifest.add(id, href, SVG_MIME, data=svg)
1516
if self.COVER_OBJECT_XP(html):
1517
object = self.COVER_OBJECT_XP(html)[0]
1518
href = hcover.abshref(object.get('data'))
1519
item = self.manifest.hrefs.get(href, None)
1520
if item is not None and item.media_type in OEB_IMAGES:
1522
return self._cover_from_html(hcover)
1524
def _ensure_cover_image(self):
1525
cover = self._locate_cover_image()
1526
if self.metadata.cover:
1527
self.metadata.cover[0].value = cover.id
1529
self.metadata.add('cover', cover.id)
1531
def _all_from_opf(self, opf):
1532
self.version = opf.get('version', '1.2')
1533
self._metadata_from_opf(opf)
1534
self._manifest_from_opf(opf)
1535
self._spine_from_opf(opf)
1536
self._guide_from_opf(opf)
1537
item = self._find_ncx(opf)
1538
self._toc_from_opf(opf, item)
1539
self._pages_from_opf(opf, item)
1540
self._ensure_cover_image()
1692
self.auto_generated_toc = True
1695
def generate(cls, opts):
1696
"""Generate an OEBBook instance from command-line options."""
1697
encoding = opts.encoding
1698
pretty_print = opts.pretty_print
1699
return cls(encoding=encoding, pretty_print=pretty_print)
1542
1701
def translate(self, text):
1702
"""Translate :param:`text` into the book's primary language."""
1543
1703
lang = str(self.metadata.language[0])
1544
1704
lang = lang.split('-', 1)[0].lower()
1545
1705
return translate(lang, text)
1547
1707
def decode(self, data):
1708
"""Automatically decode :param:`data` into a `unicode` object."""
1710
return d.replace('\r\n', '\n').replace('\r', '\n')
1548
1711
if isinstance(data, unicode):
1712
return fix_data(data)
1550
1713
if data[:2] in ('\xff\xfe', '\xfe\xff'):
1552
return data.decode('utf-16')
1715
return fix_data(data.decode('utf-16'))
1716
except UnicodeDecodeError:
1718
if self.input_encoding is not None:
1720
return fix_data(data.decode(self.input_encoding, 'replace'))
1553
1721
except UnicodeDecodeError:
1556
return data.decode('utf-8')
1724
return fix_data(data.decode('utf-8'))
1557
1725
except UnicodeDecodeError:
1559
if self.encoding is not None:
1561
return data.decode(self.encoding)
1562
except UnicodeDecodeError:
1564
1727
data, _ = xml_to_unicode(data)
1565
data = data.replace('\r\n', '\n')
1566
data = data.replace('\r', '\n')
1728
return fix_data(data)
1569
1730
def to_opf1(self):
1731
"""Produce OPF 1.2 representing the book's metadata and structure.
1733
Returns a dictionary in which the keys are MIME types and the values
1734
are tuples of (default) filenames and lxml.etree element structures.
1570
1736
package = etree.Element('package',
1571
1737
attrib={'unique-identifier': self.uid.id})
1572
1738
self.metadata.to_opf1(package)