2
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
4
Convert PDF to a reflowable format using pdftoxml.exe as the PDF parsing backend.
7
import sys, os, re, tempfile, subprocess, atexit, shutil, logging, xml.parsers.expat
8
from xml.etree.ElementTree import parse
10
from calibre import isosx, setup_cli_handlers, __appname__
11
from calibre.utils.config import OptionParser
12
from calibre.ebooks import ConversionError
14
# Name of the external executable used as the PDF parsing backend.
PDFTOXML = 'pdftoxml.exe'
# On OS X, when `sys` carries a `frameworks_dir` attribute, look for the
# executable inside that directory instead of using the bare name.
if isosx and hasattr(sys, 'frameworks_dir'):
    PDFTOXML = os.path.join(getattr(sys, 'frameworks_dir'), PDFTOXML)
18
class StyleContainer(object):
20
def set_style(self, iterator):
23
if hasattr(tok, 'style'):
25
counts = [0*i for i in range(len(styles))]
26
for i in range(len(styles)):
27
counts[i] = sum([1 for j in self if j.style == styles[i]])
29
for i in range(len(counts)):
32
self.style = styles[i]
34
if obj.style == self.style:
40
def __init__(self, attrs):
41
for a in ('number', 'width', 'height'):
42
setattr(self, a, float(attrs[a]))
44
self.current_line = None
48
if self.current_line is not None:
49
self.current_line.finalize()
50
self.lines.append(self.current_line)
51
self.current_line = None
54
self.identify_groups()
55
self.look_for_page_break()
57
def identify_groups(self):
60
for i in range(len(self.lines)):
67
if cl.left != pl.left and cl.width != pl.width:
70
for i in range(len(groups)):
72
if i +1 == len(groups):
73
stop = len(self.lines)
76
self.groups.append(self.lines[start:stop])
78
# Only probe for a header/footer when the page holds more than one group:
# the first group is the header candidate, the last the footer candidate.
if len(self.groups) > 1:
    # The list built above is `self.groups`; the previous `self.group[0]`
    # named an attribute that is never assigned and raised AttributeError.
    self.groups[0].test_header(self.width, self.height)
    self.groups[-1].test_footer(self.width, self.height)
82
def look_for_page_break(self):
85
if not g.is_footer and g.bottom > max:
87
self.page_break_after = max < 0.8*self.height
90
class Group(StyleContainer):
92
def __init__(self, lines):
94
self.set_style(self.lines)
95
self.width = max([i.width for i in self.lines])
96
self.bottom = max([i.bottom for i in self.lines])
98
for i in range(1, len(self.lines)):
99
bot = self.lines[i-1].bottom
100
top = self.lines[i].top
101
tot += abs(top - bot)
102
ltot += self.lines[i].left
103
self.average_line_spacing = tot/float(len(self.lines)-1)
104
ltot += self.lines[0].left
105
self.average_left_margin = ltot/float(len(self.lines))
106
self.left_margin = min([i.left for i in self.lines])
108
self.detect_paragraphs()
112
def detect_paragraphs(self):
116
self.lines[0].is_para_start = self.lines[0].left > self.average_left_margin+indent_buffer
117
for i in range(1, len(self.lines)):
118
pl, l = self.lines[i-1:i+1]
119
c1 = pl.bottom - l.top > self.average_line_spacing
120
c2 = l.left > self.average_left_margin+indent_buffer
121
c3 = pl.width < 0.8 * self.width
122
l.is_para_start = c1 or c2 or c3
124
def test_header(self, page_width, page_height):
    """
    Decide whether this group looks like a page header and store the
    verdict in ``self.is_header``.

    The group is flagged only when it consists of exactly one line and
    that line is narrower than half of ``page_width``.

    :param page_width: Width of the containing page.
    :param page_height: Height of the containing page. Accepted for
                        symmetry with the caller but not consulted.
    """
    single_line = len(self.lines) == 1
    self.is_header = single_line and self.lines[0].width < 0.5*page_width
127
def test_footer(self, page_width, page_height):
    """
    Decide whether this group looks like a page footer and store the
    verdict in ``self.is_footer``.

    The group is flagged only when it consists of exactly one line and
    that line is narrower than half of ``page_width``.

    :param page_width: Width of the containing page.
    :param page_height: Height of the containing page. Accepted for
                        symmetry with the caller but not consulted.
    """
    single_line = len(self.lines) == 1
    self.is_footer = single_line and self.lines[0].width < 0.5*page_width
132
def __init__(self, attrs):
133
for a in ('x', 'y', 'width', 'height'):
134
setattr(self, a, float(attrs[a]))
135
self.id = attrs['id']
138
def add_token(self, tok):
140
self.objects.append(tok)
142
ptok = self.objects[-1]
144
ptok.text += ' ' + tok.text
146
self.objects.append(tok)
148
def add(self, object):
149
if isinstance(object, Token):
150
self.add_token(object)
152
print 'WARNING: Unhandled object', object.__class__.__name__
156
for obj in self.objects:
157
if isinstance(obj, Token):
158
res.append(obj.to_xhtml())
162
class Line(list, StyleContainer):
164
def calculate_geometry(self):
    """
    Compute the bounding box of this line from the objects it contains.

    Sets four attributes on ``self``:

      * ``left``   -- ``x`` of the first object
      * ``width``  -- right edge of the last object minus ``left``
      * ``top``    -- smallest ``y`` among the objects
      * ``bottom`` -- largest ``y + height`` among the objects

    The line must hold at least one object; indexing an empty line
    raises ``IndexError``.
    """
    first, last = self[0], self[-1]
    self.left = first.x
    self.width = last.x + last.width - self.left
    self.top = min(o.y for o in self)
    self.bottom = max(o.height+o.y for o in self)
171
self.calculate_geometry()
174
def to_xhtml(self, group_id):
175
ans = '<span class="%s" '%group_id
176
if self.style is not None:
177
ans += 'style="%s"'%self.style.to_css(inline=True)
181
if isinstance(object, Text):
182
res.append(object.to_xhtml())
184
return ans%(' '.join(res))
187
class TextStyle(object):
189
def __init__(self, tok):
191
self.italic = tok.italic
192
self.font_name = tok.font_name
193
self.font_size = tok.font_size
194
self.color = tok.font_color
196
def __eq__(self, other):
197
if isinstance(other, self.__class__):
198
for a in ('font_size', 'bold', 'italic', 'font_name', 'color'):
199
if getattr(self, a) != getattr(other, a):
204
def to_css(self, inline=False):
205
fw = 'bold' if self.bold else 'normal'
206
fs = 'italic' if self.italic else 'normal'
207
fsz = '%dpt'%self.font_size
208
props = ['font-weight: %s;'%fw, 'font-style: %s;'%fs, 'font-size: %s;'%fsz,
209
'color: rgb(%d, %d, %d);'%self.color]
213
props = ['{'] + props + ['}']
214
return joiner.join(props)
218
def __init__(self, attrs):
219
for a in ('x', 'y', 'width', 'height', 'rotation', 'angle', 'font-size'):
220
setattr(self, a.replace('-', '_'), float(attrs[a]))
221
for a in ('bold', 'italic'):
222
setattr(self, a, attrs[a]=='yes')
223
self.font_name = attrs['font-name']
224
fc = re.compile(r'#([a-f0-9]{2})([a-f0-9]{2})([a-f0-9]{2})', re.IGNORECASE)
225
fc = fc.match(attrs['font-color'])
226
self.font_color = (int(fc.group(1), 16), int(fc.group(2), 16), int(fc.group(3), 16))
227
self.id = attrs['id']
229
self.style = TextStyle(self)
231
def handle_char_data(self, data):
234
def __eq__(self, other):
235
if isinstance(other, self.__class__):
236
for a in ('rotation', 'angle', 'font_size', 'bold', 'italic', 'font_name', 'font_color'):
237
if getattr(self, a) != getattr(other, a):
243
if self.style is not None:
244
ans = u'<span style="%s">%s</span>'%(self.style.to_css(inline=True), self.text)
249
class PDFDocument(object):
251
SKIPPED_TAGS = ('DOCUMENT', 'METADATA', 'PDFFILENAME', 'PROCESS', 'VERSION',
252
'COMMENT', 'CREATIONDATE')
254
def __init__(self, filename):
255
parser = xml.parsers.expat.ParserCreate('UTF-8')
256
parser.buffer_text = True
257
parser.returns_unicode = True
258
parser.StartElementHandler = self.start_element
259
parser.EndElementHandler = self.end_element
262
self.current_page = None
263
self.current_token = None
265
src = open(filename, 'rb').read()
270
def start_element(self, name, attrs):
272
self.current_token = Token(attrs)
273
self.parser.CharacterDataHandler = self.current_token.handle_char_data
276
if self.current_page.current_line is None:
277
self.current_page.current_line = Line()
278
self.current_page.current_line.append(text)
280
y, height = self.current_page.current_line[0].y, self.current_page.current_line[0].height
281
if y == text.y or y+height == text.y + text.height:
282
self.current_page.current_line.append(text)
284
self.current_page.end_line()
285
self.current_page.current_line = Line()
286
self.current_page.current_line.append(text)
288
self.current_page = Page(attrs)
289
elif name.lower() == 'xi:include':
290
print 'WARNING: Skipping vector image'
291
elif name in self.SKIPPED_TAGS:
294
print 'WARNING: Unhandled element', name
296
def end_element(self, name):
298
if self.current_token.angle == 0 and self.current_token.rotation == 0:
299
self.current_page.current_line[-1].add(self.current_token)
300
self.current_token = None
301
self.parser.CharacterDataHandler = None
303
self.current_page.finalize()
304
self.pages.append(self.current_page)
305
self.current_page = None
310
<?xml version="1.0" encoding="UTF-8"?>
311
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
312
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
313
<html xmlns="http://www.w3.org/1999/xhtml"
314
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
315
xsi:schemaLocation="http://www.w3.org/MarkUp/SCHEMA/xhtml11.xsd" >
317
<style type="text/css">
329
for page in self.pages:
330
res.append(u'<a name="%s" />'%page.id)
331
for group in page.groups:
332
if group.is_header or group.is_footer:
334
if group.style is not None:
335
styles.append(u'.%s %s\n'%(group.id, group.style.to_css()))
336
for line in group.lines:
337
if line.is_para_start:
338
indent = group.left_margin - line.left
340
res.append(u'<p style="text-indent: %dpt">%s</p>'%(indent, ''.join(para)))
342
para.append(line.to_xhtml(group.id))
343
if page.page_break_after:
344
res.append(u'<br style="page-break-after:always" />')
346
res.append(u'<p>%s</p>'%(''.join(para)))
349
return (header%dict(style='\n'.join(styles), body='\n'.join(res))).encode('utf-8')
351
class PDFConverter(object):
354
def generate_xml(cls, pathtopdf, logger):
355
pathtopdf = os.path.abspath(pathtopdf)
356
tdir = tempfile.mkdtemp('pdf2xml', __appname__)
357
atexit.register(shutil.rmtree, tdir)
358
xmlfile = os.path.basename(pathtopdf)+'.xml'
360
cmd = PDFTOXML + ' -outline "%s" "%s"'%(pathtopdf, xmlfile)
361
p = subprocess.Popen(cmd, shell=True, stderr=subprocess.STDOUT,
362
stdout=subprocess.PIPE)
363
log = p.stdout.read()
366
raise ConversionError, log
367
xmlfile = os.path.join(tdir, xmlfile)
368
if os.stat(xmlfile).st_size < 20:
369
raise ConversionError(os.path.basename(pathtopdf) + ' does not allow copying of text.')
373
def __init__(self, pathtopdf, logger, opts):
374
self.cwd = os.getcwdu()
378
self.logger.info('Converting PDF to XML')
379
self.xmlfile = self.generate_xml(pathtopdf, self.logger)
380
self.tdir = os.path.dirname(self.xmlfile)
381
self.data_dir = self.xmlfile + '_data'
382
outline_file = self.xmlfile.rpartition('.')[0]+'_outline.xml'
383
self.logger.info('Parsing XML')
384
self.document = PDFDocument(self.xmlfile)
385
self.outline = parse(outline_file)
389
def convert(self, output_dir):
    """
    Render the parsed document as XHTML and write it to
    ``document.html`` inside ``output_dir``.

    :param output_dir: Directory in which ``document.html`` is created.
    """
    doc = self.document.to_xhtml()
    # Close the handle explicitly instead of leaving it to garbage
    # collection; try/finally guarantees the close even if the write fails.
    out = open(os.path.join(output_dir, 'document.html'), 'wb')
    try:
        out.write(doc)
    finally:
        out.close()
396
parser = OptionParser(usage=\
398
%prog [options] myfile.pdf
400
Convert a PDF file to a HTML file.
402
parser.add_option('-o', '--output-dir', default='.',
403
help=_('Path to output directory in which to create the HTML file. Defaults to current directory.'))
404
parser.add_option('--verbose', default=False, action='store_true',
405
help=_('Be more verbose.'))
408
def main(args=sys.argv, logger=None):
409
parser = option_parser()
410
options, args = parser.parse_args()
412
level = logging.DEBUG if options.verbose else logging.INFO
413
logger = logging.getLogger('pdf2html')
414
setup_cli_handlers(logger, level)
417
print _('You must specify a single PDF file.')
419
options.output_dir = os.path.abspath(options.output_dir)
420
converter = PDFConverter(os.path.abspath(args[0]), logger, options)
421
converter.convert(options.output_dir)
425
if __name__ == '__main__':
b'\\ No newline at end of file'