~ubuntu-branches/ubuntu/karmic/calibre/karmic

(re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head),

136

137

# Have paragraphs show better

138

(re.compile(r'<br.*?>'), lambda match : '<p>'),

139

# Clean up spaces

140

(re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),

141

# Connect paragraphs split by -

142

(re.compile(u'(?<=[^\s][-–])[\s]*(</p>)*[\s]*(<p>)*\s*(?=[^\s])'), lambda match: ''),

143

# Add space before and after italics

144

(re.compile(u'(?<!“)<i>'), lambda match: ' <i>'),

145

(re.compile(r'</i>(?=\w)'), lambda match: '</i> '),

146

]

147

148

# Fix Book Designer markup

149

BOOK_DESIGNER = [
    # Each entry is a (compiled pattern, replacement callable) pair; the
    # callable receives the match object and returns the replacement text.

    # HR: replace a bare <hr> with a span that forces a page break.
    (re.compile(r'<hr>', re.IGNORECASE),
     lambda match: '<span style="page-break-after:always"> </span>'),

    # Create header tags.
    # Group 2 is the align value (falls back to 'center' when not captured)
    # and group 3 is the heading text. The heading text must be captured
    # explicitly: without that third group, match.group(3) raises IndexError.
    # NOTE(review): because the attribute filler before "(align=)" is lazy,
    # the align value appears to be captured only when "align=" directly
    # follows the id attribute -- verify against real Book Designer output.
    (re.compile(r'<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>([^><]*?)</h2>', re.IGNORECASE),
     lambda match: '<h1 id="BookTitle" align="%s">%s</h1>' % (
         match.group(2) if match.group(2) else 'center', match.group(3))),
    (re.compile(r'<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>([^><]*?)</h2>', re.IGNORECASE),
     lambda match: '<h2 id="BookAuthor" align="%s">%s</h2>' % (
         match.group(2) if match.group(2) else 'center', match.group(3))),

    # Title / subtitle spans become headings; DOTALL lets the span content
    # run across line breaks.
    (re.compile(r'<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE | re.DOTALL),
     lambda match: '<h2 class="title">%s</h2>' % (match.group(1),)),
    (re.compile(r'<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE | re.DOTALL),
     lambda match: '<h3 class="subtitle">%s</h3>' % (match.group(1),)),
]

163

def __init__(self, input_plugin_preprocess, plugin_preprocess, extra_opts=None):
    """Store the preprocessing hooks and options on the instance.

    input_plugin_preprocess: callable invoked with the HTML at the end of
        ``__call__`` when ``plugin_preprocess`` is truthy.
    plugin_preprocess: truthy value enabling that final hook.
    extra_opts: optional options object; ``__call__`` reads its attributes
        with ``getattr``.
    """
    self.extra_opts = extra_opts
    self.plugin_preprocess = plugin_preprocess
    self.input_plugin_preprocess = input_plugin_preprocess

168

169

def is_baen(self, src):
    """Return True if *src* has a Publisher <meta> tag whose content mentions Baen.

    The match is case-insensitive.
    """
    publisher_meta = r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"'
    found = re.search(publisher_meta, src, re.IGNORECASE)
    return found is not None

172

173

def is_book_designer(self, raw):
    """Return True if *raw* contains an <H2 ...> tag carrying id=BookTitle.

    The search is case-sensitive.
    """
    title_tag = re.search('<H2[^><]*id=BookTitle', raw)
    return title_tag is not None

175

176

def is_pdftohtml(self, src):
    """Return True if the first 1000 characters of *src* carry the pdftohtml marker.

    The previous test, ``'' in src[:1000]``, is true for every string, so
    every document was classified as pdftohtml output.
    """
    # NOTE(review): the marker comment was missing from this copy of the
    # file and has been reconstructed -- confirm it matches the comment the
    # PDF input code actually writes at the top of its output.
    return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]

178

179

def __call__(self, html, remove_special_chars=None):
    """Apply the preprocessing rules to *html* and return the result.

    html: markup to clean up.
    remove_special_chars: optional object with a ``sub(repl, string)``
        method; when given, everything it matches is deleted first.
    """
    if remove_special_chars is not None:
        html = remove_special_chars.sub('', html)
    # NUL characters are always dropped.
    html = html.replace('\0', '')

    opts = self.extra_opts

    # Source-specific rules. The checks run in this order: Baen (no extra
    # rules), Book Designer, pdftohtml; anything else gets no extra rules.
    source_rules = []
    if not self.is_baen(html):
        if self.is_book_designer(html):
            source_rules = self.BOOK_DESIGNER
        elif self.is_pdftohtml(html):
            unwrap_rules = []
            unwrap_factor = getattr(opts, 'unwrap_factor', None)
            if unwrap_factor:
                length = line_length(html, unwrap_factor)
                if length:
                    # Unwrap lines using punctuation
                    unwrap_re = re.compile(r'(?<=.{%i}[a-z\.,;:)-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE)
                    unwrap_rules.append((unwrap_re, wrap_lines))
            source_rules = self.PDFTOHTML + unwrap_rules

    # Optional header / footer removal: each enabled flag contributes one
    # rule that deletes whatever the user-supplied regex matches.
    strip_rules = []
    for flag, regex_attr in (('remove_header', 'header_regex'),
                             ('remove_footer', 'footer_regex')):
        if getattr(opts, flag, None):
            strip_rules.append(
                (re.compile(getattr(opts, regex_attr)), lambda match: '')
            )

    # Rules are applied in order: generic, header/footer, source-specific.
    for rule in self.PREPROCESS + strip_rules + source_rules:
        pattern, replacement = rule[0], rule[1]
        html = pattern.sub(replacement, html)

    # Handle broken XHTML w/ SVG: declare a namespace on the first <html
    # occurrence when its prefix is used but the namespace URI is absent.
    if 'svg:' in html and SVG_NS not in html:
        html = html.replace('<html', '<html xmlns:svg="%s"' % SVG_NS, 1)
    if 'xlink:' in html and XLINK_NS not in html:
        html = html.replace('<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1)

    html = XMLDECL_RE.sub('', html)

    if getattr(opts, 'asciiize', False):
        from calibre.ebooks.unidecode.unidecoder import Unidecoder
        html = Unidecoder().decode(html)

    # plugin_preprocess is the on/off switch; input_plugin_preprocess is the
    # callable that does the work.
    if self.plugin_preprocess:
        html = self.input_plugin_preprocess(html)

    return html

233

Older »