2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
5
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
6
__docformat__ = 'restructuredtext en'
11
from calibre import entity_to_unicode
13
# Matches an XML declaration (<?xml ... ?>) at the very start of a document,
# allowing leading whitespace. Used to strip the declaration from the HTML.
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')

# Namespace URIs injected into the <html> tag when a document uses the
# ``svg:`` / ``xlink:`` prefixes without declaring them.
SVG_NS = 'http://www.w3.org/2000/svg'
XLINK_NS = 'http://www.w3.org/1999/xlink'
17
# Callback for the "convert all entities" rule: entity_to_unicode with the five
# XML-reserved entity names passed as `exceptions`.
# NOTE(review): presumably `exceptions` means these entities are left escaped
# so the markup stays well-formed -- confirm against calibre.entity_to_unicode.
convert_entities = functools.partial(entity_to_unicode, exceptions=['quot', 'apos', 'lt', 'gt', 'amp'])
18
# Any <span>...</span> element, across newlines and regardless of tag case.
_span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE)


def sanitize_head(match):
    """Return a normalised ``<head>`` element with all ``<span>`` elements removed.

    ``match`` is a regex match whose first group holds the text found between
    the opening and closing ``<head>`` tags. Attributes on the original
    ``<head>`` tag are not preserved.
    """
    # NOTE(review): this assignment was missing from the damaged source, where
    # ``x`` was read before being bound. Group 1 is the only capture group of
    # the <head> rule that uses this callback.
    x = match.group(1)
    x = _span_pat.sub('', x)
    return '<head>\n%s\n</head>' % x
27
def chap_head(match):
    """Build an ``<h1>`` chapter heading from a chapter-detection match.

    ``match`` must expose the named groups ``chap`` (the chapter marker) and
    ``title`` (an optional chapter title). When a title was matched it is
    placed on a second line inside the same ``<h1>``.
    """
    chap = match.group('chap')
    title = match.group('title')
    # NOTE(review): the ``def`` line and this condition were missing from the
    # damaged source. ``title`` is None when the optional group did not
    # participate, and concatenating None would raise TypeError.
    if not title:
        return '<h1>'+chap+'</h1><br/>\n'
    return '<h1>'+chap+'<br/>\n'+title+'</h1><br/>\n'
34
def wrap_lines(match):
    """Replace a paragraph break between two wrapped lines with one space.

    ``match`` must expose the named group ``ital``: an optional closing
    inline tag found just before the break. When present, the tag is kept in
    front of the joining space.
    """
    ital = match.group('ital')
    # NOTE(review): everything after the ``ital`` lookup was missing from the
    # damaged source, so the function fell through and returned None. The
    # return values here are inferred from the "Un wrap using punctuation"
    # rule that uses this callback -- confirm against the upstream file.
    if not ital:
        return ' '
    return ital + ' '
41
def line_length(raw, percent):
    '''
    raw is the raw text to find the line length to use for wrapping.
    percentage is a decimal number, 0 - 1 which is used to determine
    how far in the list of line lengths to use. The list of line lengths is
    ordered smallest to largest and does not include duplicates.

    Returns 0 when raw contains no non-empty text between two <br> tags.
    Values of percent outside 0 - 1 are clamped to that range.
    '''
    # NOTE(review): the damaged source shows a space-for-space replace here,
    # which looks like a decoded '&nbsp;' entity. Treating the entity as one
    # character is an assumption -- confirm against the upstream file.
    raw = raw.replace('&nbsp;', ' ')
    linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
    lines = linere.findall(raw)

    # Distinct lengths of the non-empty lines.
    lengths = set(len(line) for line in lines if line)
    if not lengths:
        # Nothing to measure; also avoids dividing by zero below.
        return 0

    total = sum(lengths)
    avg = total / len(lengths)
    # NOTE(review): the definition of max_line was missing from the damaged
    # source; only its use as an upper cutoff is visible. Twice the average
    # is an assumption -- confirm against the upstream file.
    max_line = avg * 2

    # Drop outliers. The shortest length never exceeds the average, so at
    # least one entry always survives.
    lengths = sorted(n for n in lengths if n <= max_line)

    percent = min(max(percent, 0), 1)
    # Clamp at 0: a small percent would otherwise give index -1 and select
    # the longest length instead of the shortest.
    index = max(int(len(lengths) * percent) - 1, 0)
    return lengths[index]
81
class CSSPreProcessor(object):
    """Callable that strips ``@page`` rules from a stylesheet."""

    # An ``@page`` at-rule together with its (non-nested) declaration block.
    PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}')

    def __call__(self, data):
        """Return the CSS text ``data`` with every ``@page`` rule removed."""
        data = self.PAGE_PAT.sub('', data)
        # NOTE(review): the return was missing from the damaged source, so
        # callers would have received None.
        return data
89
def _umlaut_rules():
    """Build the rules that re-attach a detached diaeresis to its vowel.

    Each rule matches a diaeresis character, optional whitespace and <br>
    tags, then a plain vowel, and replaces the whole match with the
    corresponding umlauted letter.
    """
    pairs = (
        (u'o', u'ö'), (u'O', u'Ö'),
        (u'u', u'ü'), (u'U', u'Ü'),
        (u'e', u'ë'), (u'E', u'Ë'),
        (u'i', u'ï'), (u'I', u'Ï'),
        (u'a', u'ä'), (u'A', u'Ä'),
    )
    return [
        # ``fixed`` is bound as a default argument so each lambda keeps its
        # own replacement instead of the last one in the loop.
        (re.compile(u'¨' + r'\s*(<br.*?>)*\s*' + plain, re.UNICODE),
         lambda match, fixed=fixed: fixed)
        for plain, fixed in pairs
    ]


class HTMLPreProcessor(object):
    """Callable that cleans up raw HTML with ordered regex substitutions.

    Each rule table is a list of ``(compiled_pattern, replacement)`` pairs
    applied in order with ``pattern.sub(replacement, html)``. ``PREPROCESS``
    is always applied; ``PDFTOHTML`` or ``BOOK_DESIGNER`` is added when the
    input is detected as coming from that producer.

    NOTE(review): this class was rebuilt from a damaged copy in which
    indentation and a number of lines were lost. Lines that had to be
    restored are marked with NOTE(review) below and should be confirmed
    against the upstream file.
    """

    PREPROCESS = [
        # Some HTML generators (e.g. Frontpage) put all sorts of junk into
        # <head>, which confuses lxml.
        # NOTE(review): the callback line was missing; sanitize_head is the
        # only function in the file that consumes a <head> match.
        (re.compile(r'<head[^>]*>\n*(.*?)\n*</head>', re.IGNORECASE|re.DOTALL),
         sanitize_head),
        # Convert all entities, since lxml doesn't handle them well
        (re.compile(r'&(\S+?);'), convert_entities),
        # Remove the <![if/endif tags inserted by MS Word
        # NOTE(review): the replacement line was missing; the empty string
        # follows from "Remove" in the original comment.
        (re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE),
         lambda match: ''),
    ]

    # Fix pdftohtml markup
    PDFTOHTML = _umlaut_rules() + [
        # Remove empty numbered anchors
        (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
        # Turn <hr> tags into line breaks
        (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'),
        # Replace <br><br> with <p>
        (re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'),
        # Remove a hyphen followed by a line break
        (re.compile(r'-<br.*?>\n\r?'), lambda match: ''),

        # Remove gray background
        (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),

        # Remove non breaking spaces
        # (u'' rather than ur'': the ur prefix is a SyntaxError on Python 3
        # and both spell the same single U+00A0 character.)
        (re.compile(u'\u00a0'), lambda match : ' '),

        # Detect Chapters to match default XPATH in GUI
        (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<i><b>|<i>|<b>)?(Chapter|Epilogue|Prologue|Book|Part)\s*(\d+|\w+)?(</i></b>|</i>|</b>)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>(<i>)?\s*\w+(\s+\w+)?(</i>)?)(<br[^>]*>|</?p[^>]*>)))?', re.IGNORECASE), chap_head),
        (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head),

        # Have paragraphs show better
        (re.compile(r'<br.*?>'), lambda match : '<p>'),
        # Collapse whitespace between closing punctuation and the next tag.
        # Backslashes are doubled in the two unicode patterns below because a
        # unicode literal cannot also be raw on both Python 2 and Python 3;
        # the resulting pattern text is unchanged.
        (re.compile(u'(?<=[\\.,;\\?!”"\'])[\\s^ ]*(?=<)'), lambda match: ' '),
        # Connect paragraphs split by -
        (re.compile(u'(?<=[^\\s][-–])[\\s]*(</p>)*[\\s]*(<p>)*\\s*(?=[^\\s])'), lambda match: ''),
        # Add space before and after italics
        (re.compile(u'(?<!“)<i>'), lambda match: ' <i>'),
        (re.compile(r'</i>(?=\w)'), lambda match: '</i> '),
    ]

    # Fix Book Designer markup
    BOOK_DESIGNER = [
        # <hr> becomes a page break
        (re.compile('<hr>', re.IGNORECASE),
         lambda match : '<span style="page-break-after:always"> </span>'),
        # Create header tags. The heading text is captured as group 3: the
        # original patterns defined only two groups, so match.group(3)
        # raised IndexError on every Book Designer title/author heading.
        (re.compile(r'<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>([^><]*?)</h2>', re.IGNORECASE),
         lambda match : '<h1 id="BookTitle" align="%s">%s</h1>'%(match.group(2) or 'center', match.group(3))),
        (re.compile(r'<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>([^><]*?)</h2>', re.IGNORECASE),
         lambda match : '<h2 id="BookAuthor" align="%s">%s</h2>'%(match.group(2) or 'center', match.group(3))),
        (re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
         lambda match : '<h2 class="title">%s</h2>'%(match.group(1),)),
        (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
         lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
    ]

    # NOTE(review): the continuation line of this signature was missing from
    # the damaged source. ``extra_opts`` is required by the body; the None
    # default is an assumption, chosen because every read of it goes through
    # getattr().
    def __init__(self, input_plugin_preprocess, plugin_preprocess,
            extra_opts=None):
        """Store the plugin hook, the flag that enables it, and the options.

        input_plugin_preprocess -- callable applied to the HTML as the last
            step of __call__, only when plugin_preprocess is true.
        plugin_preprocess -- truth value gating that final step.
        extra_opts -- object read with getattr() for the optional attributes
            unwrap_factor, remove_header, header_regex, remove_footer,
            footer_regex and asciiize.
        """
        self.input_plugin_preprocess = input_plugin_preprocess
        self.plugin_preprocess = plugin_preprocess
        self.extra_opts = extra_opts

    def is_baen(self, src):
        """Return True if src has a Publisher <meta> tag mentioning Baen."""
        return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"',
                          re.IGNORECASE).search(src) is not None

    def is_book_designer(self, raw):
        """Return True if raw contains an <H2> tag with id=BookTitle."""
        return re.search('<H2[^><]*id=BookTitle', raw) is not None

    def is_pdftohtml(self, src):
        """Return True if the pdftohtml marker is in the first 1000 chars."""
        return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]

    def __call__(self, html, remove_special_chars=None):
        """Return html after applying every applicable cleanup rule.

        html -- the markup to clean.
        remove_special_chars -- optional compiled pattern; every match is
            deleted before any other processing.
        """
        if remove_special_chars is not None:
            html = remove_special_chars.sub('', html)
        html = html.replace('\0', '')

        # Pick the producer-specific rule table.
        # NOTE(review): the Baen and fallback branch bodies were missing from
        # the damaged source; an empty list is the minimal value that keeps
        # ``rules`` defined for the loop below.
        if self.is_baen(html):
            rules = []
        elif self.is_book_designer(html):
            rules = self.BOOK_DESIGNER
        elif self.is_pdftohtml(html):
            end_rules = []
            unwrap_factor = getattr(self.extra_opts, 'unwrap_factor', None)
            if unwrap_factor:
                length = line_length(html, unwrap_factor)
                # NOTE(review): this guard is restored, not visible in the
                # damaged source. A length of 0 would make the look-behind
                # match after any character and unwrap every paragraph.
                if length:
                    end_rules.append(
                        # Un wrap using punctuation
                        (re.compile(r'(?<=.{%i}[a-z\.,;:)-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
                    )

            rules = self.PDFTOHTML + end_rules
        else:
            rules = []

        # User-supplied header/footer patterns run after PREPROCESS and
        # before the producer-specific rules.
        pre_rules = []
        if getattr(self.extra_opts, 'remove_header', None):
            pre_rules.append(
                (re.compile(getattr(self.extra_opts, 'header_regex')), lambda match : '')
            )
        if getattr(self.extra_opts, 'remove_footer', None):
            pre_rules.append(
                (re.compile(getattr(self.extra_opts, 'footer_regex')), lambda match : '')
            )

        for rule in self.PREPROCESS + pre_rules + rules:
            html = rule[0].sub(rule[1], html)

        # Handle broken XHTML w/ SVG: declare the namespace on the first
        # <html tag when the prefix is used but the URI is absent.
        if 'svg:' in html and SVG_NS not in html:
            html = html.replace(
                '<html', '<html xmlns:svg="%s"' % SVG_NS, 1)
        if 'xlink:' in html and XLINK_NS not in html:
            html = html.replace(
                '<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1)

        html = XMLDECL_RE.sub('', html)

        if getattr(self.extra_opts, 'asciiize', False):
            # Imported lazily so the module is only loaded when requested.
            from calibre.ebooks.unidecode.unidecoder import Unidecoder
            unidecoder = Unidecoder()
            html = unidecoder.decode(html)

        # plugin_preprocess is the flag; input_plugin_preprocess is the hook.
        if self.plugin_preprocess:
            html = self.input_plugin_preprocess(html)

        # NOTE(review): the return was missing from the damaged source.
        return html