2
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
5
import sys, os, subprocess, logging
7
from functools import partial
8
from calibre import isosx, setup_cli_handlers, filename_to_utf8, iswindows, islinux
9
from calibre.ebooks import ConversionError, DRMError
10
from calibre.ptempfile import PersistentTemporaryDirectory
11
from calibre.ebooks.lrf import option_parser as lrf_option_parser
12
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
13
from calibre.ebooks.metadata import MetaInformation
14
from calibre.ebooks.metadata.opf import OPFCreator
15
from calibre.ebooks.metadata.pdf import get_metadata
17
# Locate the pdftohtml executable and choose how it is spawned.
# Defaults: the bare program name and a plain subprocess.Popen.
PDFTOHTML = 'pdftohtml'
popen = subprocess.Popen
if isosx and hasattr(sys, 'frameworks_dir'):
    # OS X with sys.frameworks_dir set: use the copy inside that directory.
    PDFTOHTML = os.path.join(getattr(sys, 'frameworks_dir'), PDFTOHTML)
if iswindows and hasattr(sys, 'frozen'):
    # Frozen Windows build: pdftohtml.exe sits next to the running executable.
    PDFTOHTML = os.path.join(os.path.dirname(sys.executable), 'pdftohtml.exe')
    # 0x08 is DETACHED_PROCESS (CREATE_NO_WINDOW is 0x08000000). It is passed
    # so that no console window is popped up for the child process.
    popen = partial(subprocess.Popen, creationflags=0x08)
if islinux and getattr(sys, 'frozen_path', False):
    # Linux with sys.frozen_path set: use the copy inside that directory.
    PDFTOHTML = os.path.join(getattr(sys, 'frozen_path'), 'pdftohtml')
27
def generate_html(pathtopdf, tdir):
# Convert the PDF at `pathtopdf` to HTML by running the external pdftohtml
# program, then write OPF metadata for the result.
#   pathtopdf: path to the PDF; a unicode path is encoded with the
#              filesystem encoding before use.
#   tdir:      directory that 'index.html' is joined onto and that is
#              handed to OPFCreator.
# Raises ConversionError when the PDF is not readable, and in the other
# visible raise statements below.
# NOTE(review): this listing looks incomplete. `e`, `err` and `errno` are
# used without a visible binding or import, no `return` is visible although
# the caller assigns the result, and the bare number lines look like
# line-number residue. Confirm against the complete file before editing.
29
Convert the pdf into html.
30
@return: Path to a temporary file containing the HTML.
32
if isinstance(pathtopdf, unicode):
33
pathtopdf = pathtopdf.encode(sys.getfilesystemencoding())
34
if not os.access(pathtopdf, os.R_OK):
35
raise ConversionError, 'Cannot read from ' + pathtopdf
36
index = os.path.join(tdir, 'index.html')
37
# This is necessary as pdftohtml doesn't always (linux) respect absolute paths
38
pathtopdf = os.path.abspath(pathtopdf)
# The output is named by its basename only, so pdftohtml presumably has to
# run with `tdir` as the working directory -- TODO confirm; no chdir is visible.
39
cmd = (PDFTOHTML, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge',
40
'-nodrm', pathtopdf, os.path.basename(index))
46
p = popen(cmd, stderr=subprocess.PIPE)
49
raise ConversionError(_('Could not find pdftohtml, check it is in your PATH'), True)
61
if e.errno == errno.EINTR:
68
raise ConversionError, err
69
# True when index.html is missing or smaller than 100 bytes; the statement
# this condition guards is not visible here.
if not os.path.exists(index) or os.stat(index).st_size < 100:
72
# Re-write index.html with a marker comment prepended to its content.
raw = open(index, 'rb').read()
73
open(index, 'wb').write('<!-- created by calibre\'s pdftohtml -->\n'+raw)
74
# No '<br' in the first 4000 bytes is treated as an image-based PDF.
if not '<br' in raw[:4000]:
75
raise ConversionError(os.path.basename(pathtopdf) + _(' is an image based PDF. Only conversion of text based PDFs is supported.'), True)
77
# Metadata read from the PDF. The empty MetaInformation and the title/author
# values below are presumably fallbacks; their conditions are not visible.
mi = get_metadata(open(pathtopdf, 'rb'))
79
mi = MetaInformation(None, None)
81
mi.title = os.path.splitext(os.path.basename(pathtopdf))[0]
83
mi.authors = [_('Unknown')]
84
# Build an OPF whose manifest and spine both list only index.html.
opf = OPFCreator(tdir, mi)
85
opf.create_manifest([('index.html', None)])
86
opf.create_spine(['index.html'])
87
# 'metadata.opf' is opened by relative path, so it lands in the current
# working directory.
opf.render(open('metadata.opf', 'wb'))
93
# Returns the result of lrf_option_parser() called with the usage text below
# (wrapped in _()).
# NOTE(review): the enclosing `def` line and the closing parenthesis of the
# call are not visible in this listing; main() calls option_parser(), which
# is presumably this function -- confirm against the complete file.
return lrf_option_parser(
94
_('''%prog [options] mybook.pdf
97
%prog converts mybook.pdf to mybook.lrf''')
100
def process_file(path, options, logger=None):
# Convert the PDF at `path`: generate HTML for it in a new persistent
# temporary directory, then hand that HTML file to html_process_file.
#   path:    path to the PDF; '~' is expanded and the path made absolute.
#   options: reads .verbose, .output, .lrs and .title; sets .output,
#            .pdftohtml and .title.
#   logger:  logger passed on to html_process_file.
# NOTE(review): as listed, `logger` is reassigned unconditionally (which
# would ignore the argument) and options.output is made absolute twice;
# guard lines such as an `if`/`else` may be missing -- confirm against the
# complete file.
102
level = logging.DEBUG if options.verbose else logging.INFO
103
logger = logging.getLogger('pdf2lrf')
104
setup_cli_handlers(logger, level)
105
pdf = os.path.abspath(os.path.expanduser(path))
106
tdir = PersistentTemporaryDirectory('_pdf2lrf')
107
htmlfile = generate_html(pdf, tdir)
108
# Default output: the input's base name with .lrs or .lrf, resolved against
# the current working directory.
if not options.output:
109
ext = '.lrs' if options.lrs else '.lrf'
110
options.output = os.path.abspath(os.path.basename(os.path.splitext(path)[0]) + ext)
112
options.output = os.path.abspath(options.output)
113
options.pdftohtml = True
114
# Default title: the output file's base name without its extension.
if not options.title:
115
options.title = filename_to_utf8(os.path.splitext(os.path.basename(options.output))[0])
116
html_process_file(htmlfile, options, logger)
119
def main(args=sys.argv, logger=None):
# Command-line entry point: parse `args` with option_parser() and pass
# args[1], the parsed options and `logger` to process_file.
# NOTE(review): the check that presumably guards the 'No pdf file specified'
# message, and any return statements, are not visible in this listing --
# confirm against the complete file.
120
parser = option_parser()
121
options, args = parser.parse_args(args)
125
print 'No pdf file specified'
127
process_file(args[1], options, logger)
130
if __name__ == '__main__':