1
from __future__ import with_statement
3
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
4
__docformat__ = 'restructuredtext en'
7
Splitting of the XHTML flows. Splitting can happen on page boundaries or can be
8
forced at "likely" locations to conform to size limitations. This transform
9
assumes a prior call to the flatcss transform.
12
import os, math, functools, collections, re, copy
14
from lxml.etree import XPath as _XPath
15
from lxml import etree
16
from lxml.cssselect import CSSSelector
18
from calibre.ebooks.oeb.base import OEB_STYLES, XPNSMAP as NAMESPACES, \
19
urldefrag, rewrite_links, urlunquote, barename, XHTML
20
from calibre.ebooks.epub import rules
22
XPath = functools.partial(_XPath, namespaces=NAMESPACES)
24
SPLIT_POINT_ATTR = 'csp'
27
return etree.tostring(root, encoding='utf-8')
29
class SplitError(ValueError):
31
def __init__(self, path, root):
32
size = len(tostring(root))/1024.
33
ValueError.__init__(self,
34
_('Could not find reasonable point at which to split: '
35
'%s Sub-tree size: %d KB')%
40
def __init__(self, split_on_page_breaks=True, page_breaks_xpath=None,
42
self.split_on_page_breaks = split_on_page_breaks
43
self.page_breaks_xpath = page_breaks_xpath
44
self.max_flow_size = max_flow_size
45
self.page_break_selectors = None
46
if self.page_breaks_xpath is not None:
47
self.page_break_selectors = [(XPath(self.page_breaks_xpath), False)]
49
def __call__(self, oeb, opts):
54
for item in list(self.oeb.manifest.items):
55
if item.spine_position is not None and etree.iselement(item.data):
60
def split_item(self, item):
61
page_breaks, page_break_ids = [], []
62
if self.split_on_page_breaks:
63
page_breaks, page_break_ids = self.find_page_breaks(item)
65
splitter = FlowSplitter(item, page_breaks, page_break_ids,
66
self.max_flow_size, self.oeb, self.opts)
67
if splitter.was_split:
68
am = splitter.anchor_map
69
self.map[item.href] = collections.defaultdict(
70
am.default_factory, **am)
72
def find_page_breaks(self, item):
73
if self.page_break_selectors is None:
74
self.page_break_selectors = set([])
75
stylesheets = [x.data for x in self.oeb.manifest if x.media_type in
77
for rule in rules(stylesheets):
78
before = getattr(rule.style.getPropertyCSSValue(
79
'page-break-before'), 'cssText', '').strip().lower()
80
after = getattr(rule.style.getPropertyCSSValue(
81
'page-break-after'), 'cssText', '').strip().lower()
83
if before and before != 'avoid':
84
self.page_break_selectors.add((CSSSelector(rule.selectorText),
89
if after and after != 'avoid':
90
self.page_break_selectors.add((CSSSelector(rule.selectorText),
96
for selector, before in self.page_break_selectors:
97
body = item.data.xpath('//h:body', namespaces=NAMESPACES)
100
for elem in selector(body[0]):
103
elem.set('pb_before', '1')
104
page_breaks.add(elem)
106
for i, elem in enumerate(item.data.iter()):
108
elem.set('pb_order', str(i))
109
except TypeError: # Cant set attributes on comment nodes etc.
112
page_breaks = list(page_breaks)
113
page_breaks.sort(cmp=
114
lambda x,y : cmp(int(x.get('pb_order')), int(y.get('pb_order'))))
115
page_break_ids, page_breaks_ = [], []
116
for i, x in enumerate(page_breaks):
117
x.set('id', x.get('id', 'calibre_pb_%d'%i))
119
page_breaks_.append((XPath('//*[@id="%s"]'%id),
120
x.get('pb_before', False)))
121
page_break_ids.append(id)
123
for elem in item.data.iter():
124
elem.attrib.pop('pb_order', False)
125
if elem.get('pb_before', False):
126
elem.attrib.pop('pb_before')
128
return page_breaks_, page_break_ids
132
Fix references to the split files in other content files.
134
for item in self.oeb.manifest:
135
if etree.iselement(item.data):
136
self.current_item = item
137
rewrite_links(item.data, self.rewrite_links)
139
def rewrite_links(self, url):
140
href, frag = urldefrag(url)
141
href = self.current_item.abshref(href)
143
anchor_map = self.map[href]
144
nhref = anchor_map[frag if frag else None]
145
nhref = self.current_item.relhref(nhref)
147
nhref = '#'.join((urlunquote(nhref), frag))
154
class FlowSplitter(object):
155
'The actual splitting logic'
157
def __init__(self, item, page_breaks, page_break_ids, max_flow_size, oeb,
163
self.page_breaks = page_breaks
164
self.page_break_ids = page_break_ids
165
self.max_flow_size = max_flow_size
166
self.base = item.href
169
base, ext = os.path.splitext(self.base)
170
self.base = base.replace('%', '%%')+'_split_%d'+ext
172
self.trees = [self.item.data.getroottree()]
173
self.splitting_on_page_breaks = True
175
self.split_on_page_breaks(self.trees[0])
176
self.splitting_on_page_breaks = False
178
if self.max_flow_size > 0:
180
self.log('\tLooking for large trees in %s...'%item.href)
181
trees = list(self.trees)
183
for i, tree in enumerate(trees):
184
size = len(tostring(tree.getroot()))
185
if size > self.max_flow_size:
186
self.log('\tFound large tree #%d'%i)
188
self.split_trees = []
189
self.split_to_size(tree)
190
self.tree_map[tree] = self.split_trees
192
self.log('\tNo large trees found')
195
self.trees.extend(self.tree_map.get(x, [x]))
197
self.was_split = len(self.trees) > 1
199
self.log('\tSplit into %d parts'%len(self.trees))
202
def split_on_page_breaks(self, orig_tree):
204
for elem in orig_tree.xpath('//*[@id]'):
206
if id in self.page_break_ids:
207
ordered_ids.append(self.page_breaks[self.page_break_ids.index(id)])
211
for pattern, before in ordered_ids:
214
self.log.debug('\t\tSplitting on page-break')
215
before, after = self.do_split(tree, elem[0], before)
216
self.trees.append(before)
218
self.trees.append(tree)
219
trees, ids = [], set([])
220
for tree in self.trees:
221
root = tree.getroot()
222
if self.is_page_empty(root):
223
discarded_ids = root.xpath('//*[@id]')
224
for x in discarded_ids:
226
if not x.startswith('calibre_'):
230
body = self.get_body(root)
233
body.insert(0, body.makeelement(XHTML('div'),
234
id=x, style='height:0pt'))
239
def get_body(self, root):
240
body = root.xpath('//h:body', namespaces=NAMESPACES)
245
def adjust_split_point(self, root, path):
247
Move the split point up its ancestor chain if it has no textual content
248
before it. This handles the common case:
249
<div id="chapter1"><h2>Chapter 1</h2>...</div> with a page break on the
252
sp = root.xpath(path)[0]
254
parent = sp.getparent()
255
if barename(parent.tag) in ('body', 'html'):
257
if parent.text and parent.text.strip():
259
if parent.index(sp) > 0:
263
npath = sp.getroottree().getpath(sp)
265
if self.opts.verbose > 3 and npath != path:
266
self.log.debug('\t\t\tMoved split point %s to %s'%(path, npath))
273
def do_split(self, tree, split_point, before):
275
Split ``tree`` into a *before* and *after* tree at ``split_point``,
276
preserving tag structure, but not duplicating any text.
277
All tags that have had their text and tail
278
removed have the attribute ``calibre_split`` set to 1.
280
:param before: If True, the tree is split before split_point; otherwise it is split after split_point
281
:return: before_tree, after_tree
283
path = tree.getpath(split_point)
284
tree, tree2 = copy.deepcopy(tree), copy.deepcopy(tree)
285
root = tree.getroot()
286
root2 = tree2.getroot()
287
body, body2 = map(self.get_body, (root, root2))
288
path = self.adjust_split_point(root, path)
289
split_point = root.xpath(path)[0]
290
split_point2 = root2.xpath(path)[0]
293
def nix_element(elem, top=True):
294
parent = elem.getparent()
295
index = parent.index(elem)
299
index = parent.index(elem)
300
parent[index:index+1] = list(elem.iterchildren())
303
hit_split_point = False
304
for elem in list(body.iterdescendants()):
305
if elem is split_point:
306
hit_split_point = True
316
hit_split_point = False
317
for elem in list(body2.iterdescendants()):
318
if elem is split_point2:
319
hit_split_point = True
321
nix_element(elem, top=False)
323
if not hit_split_point:
324
nix_element(elem, top=False)
329
def is_page_empty(self, root):
330
body = self.get_body(root)
333
txt = re.sub(r'\s+', '',
334
etree.tostring(body, method='text', encoding=unicode))
337
for img in root.xpath('//h:img', namespaces=NAMESPACES):
338
if img.get('style', '') != 'display:none':
342
def split_text(self, text, root, size):
343
self.log.debug('\t\t\tSplitting text of length: %d'%len(text))
344
rest = text.replace('\r', '')
345
parts = re.split('\n\n', rest)
346
self.log.debug('\t\t\t\tFound %d parts'%len(parts))
347
if max(map(len, parts)) > size:
348
raise SplitError('Cannot split as file contains a <pre> tag '
349
'with a very large paragraph', root)
353
if len(buf) + len(part) < size:
361
def split_to_size(self, tree):
362
self.log.debug('\t\tSplitting...')
363
root = tree.getroot()
364
# Split large <pre> tags
365
for pre in list(root.xpath('//pre')):
366
text = u''.join(pre.xpath('descendant::text()'))
368
for child in list(pre.iterchildren()):
370
if len(pre.text) > self.max_flow_size*0.5:
371
frags = self.split_text(pre.text, root, int(0.2*self.max_flow_size))
374
pre2 = copy.copy(pre)
377
new_pres.append(pre2)
378
new_pres[-1].tail = pre.tail
383
split_point, before = self.find_split_point(root)
384
if split_point is None:
385
raise SplitError(self.item.href, root)
387
for t in self.do_split(tree, split_point, before):
389
if self.is_page_empty(r):
391
size = len(tostring(r))
392
if size <= self.max_flow_size:
393
self.split_trees.append(t)
395
'\t\t\tCommitted sub-tree #%d (%d KB)'%(
396
len(self.split_trees), size/1024.))
398
self.split_to_size(t)
400
def find_split_point(self, root):
402
Find the tag at which to split the tree rooted at `root`.
412
We try to split in the "middle" of the file (as defined by tag counts).
414
def pick_elem(elems):
416
elems = [i for i in elems if i.get(SPLIT_POINT_ATTR, '0') !=
419
i = int(math.floor(len(elems)/2.))
420
elems[i].set(SPLIT_POINT_ATTR, '1')
424
'//*[re:match(name(), "h[1-6]", "i")]',
425
'/h:html/h:body/h:div',
433
elems = root.xpath(path, namespaces=NAMESPACES)
434
elem = pick_elem(elems)
437
XPath(elem.getroottree().getpath(elem))
446
Commit all changes caused by the split. This removes the previously
447
introduced ``calibre_split`` attribute and calculates an *anchor_map* for
448
all anchors in the original tree. Internal links are re-directed. The
449
original file is deleted and the split files are saved.
451
if not self.was_split:
453
self.anchor_map = collections.defaultdict(lambda :self.base%0)
456
for i, tree in enumerate(self.trees):
457
root = tree.getroot()
458
self.files.append(self.base%i)
459
for elem in root.xpath('//*[@id or @name]'):
460
anchor = elem.get('id', '')
462
anchor = elem.get('name')
463
self.anchor_map[anchor] = self.files[-1]
464
for elem in root.xpath('//*[@%s]'%SPLIT_POINT_ATTR):
465
elem.attrib.pop(SPLIT_POINT_ATTR, '0')
467
spine_pos = self.item.spine_position
469
for current, tree in zip(*map(reversed, (self.files, self.trees))):
470
for a in tree.getroot().xpath('//h:a[@href]', namespaces=NAMESPACES):
471
href = a.get('href').strip()
472
if href.startswith('#'):
474
file = self.anchor_map[anchor]
475
file = self.item.relhref(file)
477
a.set('href', file+href)
479
new_id = self.oeb.manifest.generate(id=self.item.id)[0]
480
new_item = self.oeb.manifest.add(new_id, current,
481
self.item.media_type, data=tree.getroot())
482
self.oeb.spine.insert(spine_pos, new_item, self.item.linear)
485
for ref in self.oeb.guide.values():
486
href, frag = urldefrag(ref.href)
487
if href == self.item.href:
488
nhref = self.anchor_map[frag if frag else None]
490
nhref = '#'.join((nhref, frag))
493
def fix_toc_entry(toc):
495
href, frag = urldefrag(toc.href)
496
if href == self.item.href:
497
nhref = self.anchor_map[frag if frag else None]
499
nhref = '#'.join((nhref, frag))
506
fix_toc_entry(self.oeb.toc)
508
self.oeb.manifest.remove(self.item)