2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
3
from __future__ import with_statement
6
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
7
__docformat__ = 'restructuredtext en'
11
from lxml import etree
12
from urlparse import urlparse
14
from calibre.ebooks.oeb.base import XPNSMAP, TOC, XHTML
15
from calibre.ebooks import ConversionError
19
# NOTE(review): the enclosing `def` and `try:` lines are not visible in this
# view, and the bare integers look like line numbers left over from
# extraction -- confirm against the upstream file.
# Compile `x` into an lxml XPath object using the XPNSMAP namespace map.
return etree.XPath(x, namespaces=XPNSMAP)
20
# A malformed expression is reported as a ConversionError whose message
# embeds repr(x).
except etree.XPathSyntaxError:
21
raise ConversionError(
22
'The syntax of the XPath expression %s is invalid.' % repr(x))
24
class DetectStructure(object):
26
# Entry point of the transform: detects chapters, optionally builds a table
# of contents, filters TOC entries, applies page breaks and names untitled
# TOC nodes.
# NOTE(review): the bare integers look like original line numbers left over
# from extraction; gaps in them (e.g. 26 -> 30, 43 -> 45, 59 -> 62) suggest
# statements are missing from this view, including whatever binds self.log,
# self.oeb and self.opts -- confirm against the upstream file.
def __call__(self, oeb, opts):
30
self.log('Detecting structure...')
32
self.detect_chapters()
33
# TOC generation runs only when the current TOC is flagged as auto-generated
# or opts.use_auto_toc is set.
if self.oeb.auto_generated_toc or opts.use_auto_toc:
34
# Keep a reference to the existing TOC so it can be restored further down.
orig_toc = self.oeb.toc
36
self.create_level_based_toc()
37
# The chapter-based and link-based strategies are tried depending on how
# many entries the TOC holds at each point.
if self.oeb.toc.count() < 1:
38
if not opts.no_chapters_in_toc and self.detected_chapters:
39
self.create_toc_from_chapters()
40
if self.oeb.toc.count() < opts.toc_threshold:
41
self.create_toc_from_links()
42
# Restore the original TOC when the current one has fewer than 2 entries
# while the original had more than 2.
if self.oeb.toc.count() < 2 and orig_toc.count() > 2:
43
self.oeb.toc = orig_toc
45
# NOTE(review): presumably an `else:` (number 44) preceded this line, making
# the flag and log message apply only when the new TOC is kept -- confirm.
self.oeb.auto_generated_toc = True
46
self.log('Auto generated TOC with %d entries.' %
49
# Drop TOC nodes whose title is empty or matches the opts.toc_filter regex.
# NOTE(review): `re` is not imported in the visible lines, and nodes are
# removed while iterating toc.iter() -- verify both.
if opts.toc_filter is not None:
50
regexp = re.compile(opts.toc_filter)
51
for node in self.oeb.toc.iter():
52
if not node.title or regexp.search(node.title) is not None:
53
self.oeb.toc.remove(node)
55
# Append a page-break-before declaration to the style attribute of every
# element matched by the opts.page_breaks_before XPath in each spine item.
if opts.page_breaks_before is not None:
56
pb_xpath = XPath(opts.page_breaks_before)
57
for item in oeb.spine:
58
for elem in pb_xpath(item.data):
59
style = elem.get('style', '')
62
# NOTE(review): numbers 60-61 are missing; presumably they add a separator
# when an existing style is non-empty -- confirm.
elem.set('style', style+'page-break-before:always')
64
# Give a placeholder title to TOC nodes whose title is empty or only
# whitespace.
for node in self.oeb.toc.iter():
65
if not node.title or not node.title.strip():
66
node.title = _('Unnamed')
68
# Finds chapter elements with the opts.chapter XPath, records them in
# self.detected_chapters as (spine item, element) pairs, logs each one and
# inserts a marker element before it according to opts.chapter_mark.
# NOTE(review): the bare integers look like original line numbers left over
# from extraction; gaps (69 -> 71, 82 -> 84) suggest missing statements --
# confirm against the upstream file.
def detect_chapters(self):
69
self.detected_chapters = []
71
chapter_xpath = XPath(self.opts.chapter)
72
for item in self.oeb.spine:
73
for x in chapter_xpath(item.data):
74
self.detected_chapters.append((item, x))
76
chapter_mark = self.opts.chapter_mark
77
# Inline styles used for the marker elements created below.
page_break_before = 'display: block; page-break-before: always'
78
page_break_after = 'display: block; page-break-after: always'
79
for item, elem in self.detected_chapters:
80
# Join all descendant text nodes; only the first 50 characters are logged.
text = u' '.join([t.strip() for t in elem.xpath('descendant::text()')])
81
self.log('\tDetected chapter:', text[:50])
82
# NOTE(review): the body of the 'none' branch (number 83) is not visible;
# presumably it skips marker insertion -- confirm.
if chapter_mark == 'none':
84
# 'rule': a plain <hr>.
elif chapter_mark == 'rule':
85
mark = etree.Element(XHTML('hr'))
86
# 'pagebreak': a <div> styled with page-break-after.
elif chapter_mark == 'pagebreak':
87
mark = etree.Element(XHTML('div'), style=page_break_after)
88
# Any other value: an <hr> styled with page-break-before.
else: # chapter_mark == 'both':
89
mark = etree.Element(XHTML('hr'), style=page_break_before)
90
# Insert the marker as the sibling immediately preceding the chapter element.
elem.addprevious(mark)
92
# Calls add_leveled_toc_items() on every spine item to build the
# level-based TOC.
# NOTE(review): the body of the `level1_toc is None` check (number 94) is not
# visible; presumably it returns early so nothing is built without a level 1
# expression -- confirm against the upstream file.
def create_level_based_toc(self):
93
if self.opts.level1_toc is None:
95
for item in self.oeb.spine:
96
self.add_leveled_toc_items(item)
98
# Adds one TOC entry per pair in self.detected_chapters, using
# elem_to_link() to obtain the entry text and href.
def create_toc_from_chapters(self):
99
# Starting play order, taken from the TOC itself.
counter = self.oeb.toc.next_play_order()
100
for item, elem in self.detected_chapters:
101
text, href = self.elem_to_link(item, elem, counter)
102
# NOTE(review): no increment of `counter` is visible (numbers jump
# 102 -> 105); presumably it is advanced per chapter on a missing line --
# confirm.
self.oeb.toc.add(text, href, play_order=counter)
105
# Builds TOC entries from the <a href> elements found in each spine item,
# skipping hrefs and link texts the TOC already contains.
# NOTE(review): the bare integers look like original line numbers left over
# from extraction; gaps (107 -> 109, 112 -> 114) suggest missing statements,
# including the first assignment of `href` -- confirm against the upstream
# file.
def create_toc_from_links(self):
106
for item in self.oeb.spine:
107
for a in XPath('//h:a[@href]')(item.data):
109
purl = urlparse(href)
110
# Only links with no URL scheme or with the 'file' scheme are considered.
if not purl[0] or purl[0] == 'file':
111
href, frag = purl.path, purl.fragment
112
# Resolve the path through the spine item's abshref().
href = item.abshref(href)
114
# NOTE(review): presumably guarded by a fragment check on the missing
# line 113 -- confirm.
href = '#'.join((href, frag))
115
if not self.oeb.toc.has_href(href):
116
# Link text is the joined descendant text, capped at 100 characters.
text = u' '.join([t.strip() for t in \
117
a.xpath('descendant::text()')])
118
text = text[:100].strip()
119
if not self.oeb.toc.has_text(text):
120
self.oeb.toc.add(text, href,
121
play_order=self.oeb.toc.next_play_order())
125
# Derives a TOC label and an href for `elem` inside spine item `item`.
# NOTE(review): numbers jump 128 -> 130 and the view ends at 130, so no
# return statement is visible; callers unpack two values (text, href), so
# presumably both are returned -- confirm against the upstream file.
def elem_to_link(self, item, elem, counter):
126
# Label: all descendant text joined with spaces, capped at 100 characters.
text = u' '.join([t.strip() for t in elem.xpath('descendant::text()')])
127
text = text[:100].strip()
128
# Use the element's existing id, or a generated 'calibre_toc_<counter>' one.
# Note that the local name `id` shadows the builtin.
id = elem.get('id', 'calibre_toc_%d'%counter)
130
# href is the item's href plus the id as a fragment.
href = '#'.join((item.href, id))
134
# Adds TOC entries for elements matched by the opts.level1_toc XPath, and by
# the level2_toc / level3_toc XPaths when those options are set.
# NOTE(review): this view is heavily truncated. The bare integers look like
# original line numbers left over from extraction, and the gaps between them
# (e.g. 135 -> 143, 153 -> 156, 172 -> 174) hide the lines that bind
# `document`, `elem`, `counter`, `level1_order`, `added` and `added2`, as
# well as the loop headers. Confirm everything below against the upstream
# file before relying on it.
def add_leveled_toc_items(self, item):
135
level1 = XPath(self.opts.level1_toc)(item.data)
143
text, _href = self.elem_to_link(document, elem, counter)
146
# Add a TOC entry and remember the created node in level1_order.
node = self.oeb.toc.add(text, _href,
147
play_order=self.oeb.toc.next_play_order())
148
level1_order.append(node)
150
#node.add(_('Top'), _href)
151
# Second level: processed only when a level 2 expression is configured.
if self.opts.level2_toc is not None:
153
level2 = list(XPath(self.opts.level2_toc)(document.data))
156
# Walk the document's descendants; when the walk reaches `elem` and a
# level 1 node is set, a child entry is added under it and stored in added2.
for item in document.data.iterdescendants():
157
if item in added.keys():
159
elif item == elem and level1 is not None:
160
text, _href = self.elem_to_link(document, elem, counter)
163
added2[elem] = level1.add(text, _href,
164
play_order=self.oeb.toc.next_play_order())
165
# Third level: same walk, with added2 supplying the current level 2 node.
if self.opts.level3_toc is not None:
166
level3 = list(XPath(self.opts.level3_toc)(document.data))
169
for item in document.data.iterdescendants():
170
if item in added2.keys():
171
level2 = added2[item]
172
elif item == elem and level2 is not None:
174
# NOTE(review): presumably the continuation of a `text, _href = \`
# assignment on the missing line 173 -- confirm.
self.elem_to_link(document, elem, counter)
177
level2.add(text, _href,
178
play_order=self.oeb.toc.next_play_order())