1
# -*- coding: utf-8 -*-
2
from __future__ import with_statement
5
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
6
__docformat__ = 'restructuredtext en'
9
Split PDF file into multiple PDF documents.
13
from optparse import OptionGroup, Option
15
from calibre.ebooks.metadata.meta import metadata_from_formats
16
from calibre.ebooks.metadata import authors_to_string
17
from calibre.utils.config import OptionParser
18
from calibre.utils.logging import Log
19
from calibre.constants import preferred_encoding
20
from calibre.customize.conversion import OptionRecommendation
21
from calibre.ebooks.pdf.verify import is_valid_pdf, is_encrypted
23
from pyPdf import PdfFileWriter, PdfFileReader
26
%prog %%name [options] file.pdf page_to_split_on ...
27
%prog %%name [options] file.pdf page_range_to_split_on ...
31
%prog %%name file.pdf 6
32
%prog %%name file.pdf 6-12
33
%prog %%name file.pdf 6-12 8 10 9-20
39
OptionRecommendation(name='output', recommended_value='split.pdf',
40
level=OptionRecommendation.HIGH, long_switch='output', short_switch='o',
41
help=_('Path to output file. By default a file is created in the current directory.')),
44
def print_help(parser, log):
    """Send the parser's formatted help text to *log*.

    parser: option parser providing ``format_help()``.
    log: callable that receives the help text (already encoded with
         ``preferred_encoding``; unencodable characters are replaced).
    """
    # Named help_text so the builtin ``help`` is not shadowed.
    help_text = parser.format_help().encode(preferred_encoding, 'replace')
    # Without this call the text is built and silently discarded.
    log(help_text)
48
def option_parser(name):
    """Return an OptionParser whose usage text names the sub-command *name*.

    Every '%%name' placeholder in the module-level USAGE template is
    replaced by *name* before the parser is built.
    """
    return OptionParser(usage=USAGE.replace('%%name', name))
52
def option_recommendation_to_cli_option(add_option, rec):
    """Register one option recommendation as an optparse command-line option.

    add_option: callable that accepts the finished ``optparse.Option``
                (for example an OptionGroup's ``add_option``).
    rec: recommendation object; ``rec.option`` supplies name, help, choices
         and the short/long switches, ``rec.recommended_value`` the default.
    """
    # The option description hangs off the recommendation; it has to be
    # bound before any of the switch/attribute lookups below.
    opt = rec.option
    # The short switch is optional, the long switch is always present.
    switches = ['-' + opt.short_switch] if opt.short_switch else []
    switches.append('--' + opt.long_switch)
    attrs = dict(dest=opt.name, help=opt.help,
                 choices=opt.choices, default=rec.recommended_value)
    add_option(Option(*switches, **attrs))
60
def add_options(parser):
    """Attach the 'Split Options' group and its options to *parser*.

    parser: the option parser to extend; it is modified in place.
    """
    group = OptionGroup(parser, _('Split Options:'), _('Options to control the transformation of pdf'))
    parser.add_option_group(group)
    add_option = group.add_option

    # Each recommendation becomes one command-line option in the group.
    # NOTE(review): assumes the module-level collection of
    # OptionRecommendation objects is named OPTIONS -- confirm.
    for rec in OPTIONS:
        option_recommendation_to_cli_option(add_option, rec)
68
def split_pdf(in_path, pages, page_ranges, out_name, metadata=None):
    """Write one output PDF per requested split point.

    in_path: path of the PDF to split.
    pages: zero-based page indices; each one produces a file holding that
           page through the last page of the document.
    page_ranges: [start, end] pairs of zero-based, inclusive page indices;
                 each one produces a file holding exactly that range.
    out_name: output path prefix; write_pdf appends the suffix and '.pdf'.
    metadata: optional metadata object passed through to write_pdf.
    """
    # The reader pulls pages from the stream on demand, so the file must
    # stay open until every output has been written; the with-block also
    # guarantees it is closed afterwards.
    with open(os.path.abspath(in_path), 'rb') as in_file:
        pdf = PdfFileReader(in_file)
        last_page = pdf.numPages - 1

        # Single pages and ranges have different shapes (int vs. pair), so
        # they are handled in separate loops; suffixes are 1-based.
        for index in pages:
            write_pdf(pdf, out_name, '%s' % (index + 1), index, last_page, metadata)
        for start, end in page_ranges:
            write_pdf(pdf, out_name, '%s-%s' % (start + 1, end + 1), start, end, metadata)
79
def write_pdf(pdf, name, suffix, start, end, metadata=None):
    """Copy pages start..end (inclusive) of *pdf* into '<name><suffix>.pdf'.

    pdf: open PDF reader providing ``getPage(index)``.
    name, suffix: joined and given a '.pdf' extension to form the output path.
    start, end: first and last page index to copy.
    metadata: optional object with ``title`` and ``authors``; when omitted
              the title and author fall back to a translated 'Unknown'.
    """
    # metadata is optional in the signature, so it must not be dereferenced
    # unconditionally.
    if metadata is None:
        title = _('Unknown')
        author = _('Unknown')
    else:
        title = metadata.title
        author = authors_to_string(metadata.authors)

    out_pdf = PdfFileWriter(title=title, author=author)
    for page_num in range(start, end + 1):
        out_pdf.addPage(pdf.getPage(page_num))
    with open('%s%s.pdf' % (name, suffix), 'wb') as out_file:
        out_pdf.write(out_file)
102
def split_args(args):
    """Sort raw command-line arguments into input file, pages and ranges.

    args: iterable of argument strings (without the program name).

    Returns a tuple ``(pdf, pages, page_ranges, bad)``:
      pdf         -- the first argument ending in '.pdf' (case-insensitive),
                     or '' when there is none
      pages       -- single page numbers, as digit strings
      page_ranges -- [start, end] pairs of digit strings
      bad         -- sorted, de-duplicated arguments that were not understood,
                     including any additional '.pdf' argument
    """
    pdf = ''
    pages = []
    page_ranges = []
    bad = []

    for arg in args:
        # Find the pdf input
        if re.search(r'(?iu)^.*?\.pdf[ ]*$', arg) is not None:
            if pdf == '':
                pdf = arg
            else:
                # Only one input file is supported.
                bad.append(arg)
            continue

        # Find single indexes
        single = re.search(r'^[ ]*(?P<page>\d+)[ ]*$', arg)
        if single is not None:
            pages.append(single.group('page'))
            continue

        # Find index ranges
        span = re.search(r'^[ ]*(?P<start>\d+)[ ]*-[ ]*(?P<end>\d+)[ ]*$', arg)
        if span is not None:
            start = span.group('start')
            end = span.group('end')
            # Check whether the range is really a single index; compare
            # numerically so that e.g. '07-7' is recognised as one page.
            if int(start) == int(end):
                pages.append(start)
            else:
                page_ranges.append([start, end])
            continue

        bad.append(arg)

    bad = sorted(set(bad))

    return pdf, pages, page_ranges, bad
128
def clean_page_list(pdf_path, pages, page_ranges):
    """Normalise the requested pages and page ranges against the real PDF.

    pdf_path: path of the PDF; it is opened only to read its page count.
    pages: 1-based page numbers (ints or digit strings).
    page_ranges: (start, end) pairs of 1-based page numbers (ints or digit
                 strings), in either order.

    Returns ``(pages, page_ranges)`` as sorted, duplicate-free lists of
    zero-based ints; each range is a ``[low, high]`` list. Page numbers
    beyond the end of the document are set to the last page, and numbers
    below 1 to the first page. A range lying entirely beyond the end of the
    document is reported as the single last page instead of a range.
    """
    # Close the file as soon as the page count is known.
    with open(os.path.abspath(pdf_path), 'rb') as pdf_file:
        total_pages = PdfFileReader(pdf_file).numPages

    def to_index(page):
        # Clamp a 1-based page number into the document, then make it
        # zero-based. The lower bound keeps '0' from turning into index -1.
        return max(1, min(page, total_pages)) - 1

    single_pages = set()
    for page in pages:
        single_pages.add(to_index(int(page)))

    ranges = []
    for start, end in page_ranges:
        start = int(start)
        end = int(end)

        # Entirely past the end: collapse to the last page as a single page.
        if start > total_pages and end > total_pages:
            single_pages.add(total_pages - 1)
            continue

        bounds = sorted([to_index(start), to_index(end)])
        if bounds not in ranges:
            ranges.append(bounds)

    # Remove duplicates and sort
    return sorted(single_pages), sorted(ranges)
167
def main(args=sys.argv, name=''):
    """Command-line entry point: split one PDF at the requested pages.

    args: argument vector; the first positional entry left after option
          parsing is skipped as the program name.
    name: sub-command name substituted into the usage text.

    Returns 0 on success and 1 after reporting an error.
    """
    log = Log()
    parser = option_parser(name)
    add_options(parser)

    opts, args = parser.parse_args(args)

    pdf, pages, page_ranges, unknown = split_args(args[1:])

    # Both an input file and at least one split point (a single page or a
    # range) are needed; either one missing is an error.
    if pdf == '' or (pages == [] and page_ranges == []):
        print('Error: PDF and where to split is required.\n')
        print_help(parser, log)
        return 1

    if unknown != []:
        for arg in unknown:
            print('Error: Unknown argument `%s`' % arg)
        print_help(parser, log)
        return 1

    if not is_valid_pdf(pdf):
        print('Error: Could not read file `%s`.' % pdf)
        return 1

    if is_encrypted(pdf):
        # Report the offending file, not args[0] (the program name).
        print('Error: file `%s` is encrypted.' % pdf)
        return 1

    pages, page_ranges = clean_page_list(pdf, pages, page_ranges)

    mi = metadata_from_formats([pdf])

    # The extension of --output is dropped; split_pdf adds a per-part
    # suffix and '.pdf' to the remaining prefix.
    split_pdf(pdf, pages, page_ranges, os.path.splitext(opts.output)[0], mi)

    return 0
203
if __name__ == '__main__':