1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
|
#!/usr/bin/python
'''
Bkrpr
Convert scanned images of text into flexible text-based documents.
Copyright 2008 James Vasile
Released under GPLv3
See LICENSE, gplv3 or http://www.gnu.org/licenses/gpl-3.0.html
'''
# In debugging, you might want im.show. To make im.show work on Ubuntu:
# 'apt-get install xloadimage; ln -s /usr/bin/xview /usr/bin/xv'
# Run 'convert file.pdf page-%03d.pgm' to turn a multipage PDF into a format we can use.
import os
import sys
import getopt
import re
import Bkrpr.config as config
import Bkrpr.cli as cli
import Bkrpr.wxGUI.main as GUI
from Bkrpr.oneline import comm
## TODO
# TODO: Write some routines for actually OCRing the images
# TODO: Write an interface and routines for correcting that OCR
# TODO: Documentation
# TODO: Improve accuracy of line split
# TODO: Write SiSU files after we have good OCR text
# TODO: Rotate should know which are even and which are odd pages by filename
def help_msg():
    '''Print the one-sentence description of what the program does.'''
    # The text already ends in a newline and print adds another, so the
    # description is followed by one blank line.
    description = "Slice image of text into smaller images, each with one line.\n"
    print(description)
def usage():
    '''Print the command-line usage text.

    The two %s placeholders in the text (the usage line and the example
    command) are both filled with config.bin_name.
    '''
    name = config.bin_name
    template = '''
Usage: %s [action] [options] [filespec]
ACTIONS (select one of these)
-r --rotate\tTurn the image so the page is right-side-up
-c --crop\tAutocrop the image to the page edge
-l --lines\tCut into smaller images, each with one line of text
-o --ocr\tOcr the rotated, cropped, line-split text
OPTIONS
--90 g\t\tOverride filespec with a file glob for pages to be rotated 90 degrees clockwise
--180 g\t\tOverride filespec with a file glob for pages to be rotated 180 degrees clockwise
--270 g\t\tOverride filespec with a file glob for pages to be rotated 270 degrees clockwise
--360 g\t\tOverride filespec with a file glob for pages not to be rotated (Overrides filespec)
--brightness n\tBrightness as percentage of current brightness (default is 100)
--continue\t\tDon't overwrite existing rotated/cropped/line-split files
--contrast n\tContrast as percentage of current contrast (default is 100)
--even d\t\tRotate the even-numbered pages d degrees. d = [90|180|270|360] (uninmplemented)
--gui\t\tUse the GUI interface
-h, --help
--level [on|off]\tSet to 'on' to rotate the images level after line-splitting
--odd d\t\tRotate the odd-numbered pages d degrees. d = [90|180|270|360] (unimplemented)
--tilt n\t\tSpecify max tilt of text in degrees from horizon
-v, --verbose
Filespec is a file glob, so it can be one file or a whole dir.
You might have to put the file glob in quotes.
NOTES
--90, --180, --270, --360
Rotating is quite slow (about a minute per page) unless you can
specify which are odd and which are even pages with --90, --180, --270
and --360. If you have pages to be rotated and cropped, but only some
need rotation, specify the ones that do not need rotation with --360.
--even, --odd
You can specify the amount to rotate even- and odd-numbered pages with
--even and --odd. The final digit appearing in a filename determines
the even or oddness. 3rd_edition_book_7_page_34_unabridged.tif is
even, as is 3rd_edition_page_34_book_8_edited.tif
EXAMPLES
To rotate and crop a set of pages where we know which are even and
which are odd and how much to rotate each by. Note that the even
pages don't need rotating, just cropping:
%s.py -cr --180 'odd*.tiff' --360 'even*.tiff'
Note the need to quote the file globs.
'''
    print(template % (name, name))
def dev():
    '''Print the developer-only command-line options.'''
    # The trailing empty entry reproduces the newline that ends the text;
    # print then adds one more.
    lines = (
        "DEVELOPMENT",
        "--dev\t\tThis message",
        "--doctest\tRun doctest on this file TODO: expand to all our modules (TODO: write doctests)",
        "",
    )
    print("\n".join(lines))
def do_parse():
    '''Future option parser, not implemented yet.

    Parses sys.argv[1:] with optparse and returns the resulting options
    object.  Only --local (a boolean flag, default False) is defined so far.
    '''
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option("--local", dest="local", default=False,
                      action="store_true",
                      help="Use local version of branch instead of grabbing most recent version.")
    # TODO: the --program option is still disabled.  Until it is enabled the
    # parsed options have no 'program' attribute, so nothing may read
    # options.program (doing so raised AttributeError on every call).
    #parser.add_option("-p", "--program", dest="program", default = program_name,
    #                  help="specify program name. Default is %s" % program_name,
    #                  metavar="PROGRAM")
    options, _args = parser.parse_args()
    return options
def do_args(argv):
    '''Check all the commandline arguments.  Pass it sys.argv[1:].

    Sets config.cli (False when the GUI should be started, True otherwise)
    and may set config.cont and config.verbose_level.

    Returns None when argv is empty (GUI mode); otherwise a dict of the
    parsed actions and options.  In CLI mode the dict holds 'path' (the
    first positional argument) unless a rotation glob (--90/--180/--270/
    --360) was given.

    Exits with status 2 on an unknown option, on a missing filespec, or
    when no action was selected.  Exits with status 0 after --help, --dev
    or --doctest.
    '''
    if len(argv) == 0:
        comm.out("Starting GUI. Run %s --help for commandline operation."
                 % (config.bin_name))
        config.cli = False
        return

    try:
        # Every entry needs its own comma: adjacent string literals are
        # silently concatenated, which previously fused "360=" and
        # "brightness=" into one bogus option name.
        opts, args = getopt.getopt(argv, "cdghlrv",
                                   ["90=", "180=", "270=", "360=",
                                    "brightness=", "continue", "contrast=",
                                    "crop", "dev", "even=", "debug",
                                    "destination=", "doctest", "gui", "help",
                                    "level=", "lines", "odd=", "regex=",
                                    "resume", "rotate", "tilt=", "verbose"])
    except getopt.GetoptError:
        # sys.exc_info() keeps this handler valid on every Python version.
        err = sys.exc_info()[1]
        print(str(err))
        print("--help for options")
        sys.exit(2)

    config.cli = True
    ret = {}

    # Options whose argument is stored verbatim under the given key.
    # NOTE(review): '--hocr' is handled here but "hocr=" is not registered
    # with getopt above, so it can never be produced -- confirm whether it
    # should be added to the long-option list.
    value_keys = {
        "--90": '90',
        "--180": '180',
        "--270": '270',
        "--360": '360',
        "--brightness": 'brightness',
        "--contrast": 'contrast',
        "--destination": 'destination',
        "--even": 'even',
        "--hocr": 'hocr',
        "--odd": 'odd',
        "--regex": 'regex',
        "--tilt": 'tilt',
    }
    # Options that simply switch a key on.
    flag_keys = {
        "-c": 'crop', "--crop": 'crop',
        "-g": 'generate_html',
        "-l": 'lines', "--lines": 'lines',
        "-r": 'rotate', "--rotate": 'rotate',
        "--resume": 'resume',
    }

    # Options are matched by exact name.  The old `opt in ("--gui")` form was
    # a substring test against a plain string, so "-g" matched "--gui".
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            help_msg()
            sys.exit()
        elif opt in value_keys:
            ret[value_keys[opt]] = arg
        elif opt in flag_keys:
            ret[flag_keys[opt]] = True
        elif opt == "--continue":
            config.cont = True
        elif opt in ("-d", "--debug"):
            config.verbose_level = config.DEBUG
        elif opt == "--doctest":
            import doctest
            doctest.testmod()
            sys.exit()
        elif opt == "--dev":
            dev()
            sys.exit()
        elif opt == "--gui":
            config.cli = False
        elif opt == "--level":
            ret['level'] = arg == 'on'
        elif opt in ("-v", "--verbose"):
            config.verbose_level = config.VERBOSE
        else:
            print("Invalid commandline option: %s %s" % (opt, arg))
            print("Incidentally, if you're seeing this, "
                  "there's a bug in commandline parsing.")
            usage()
            sys.exit(2)

    # The GUI collects its own input, so skip the CLI-only validation.
    if not config.cli:
        return ret

    # A positional filespec is only required when no rotation glob was given.
    if not ('90' in ret or '180' in ret or '270' in ret or '360' in ret):
        if len(args) == 0:
            print("Must specify path or image filename.")
            usage()
            sys.exit(2)
        ret['path'] = args[0]

    if not ('crop' in ret or 'rotate' in ret or
            'lines' in ret or 'generate_html' in ret):
        print("Must specify at least one action: "
              "crop (-c), rotate (-r), lines(-l).")
        usage()
        sys.exit(2)

    return ret
def detect_os():
    '''Sets config.linux, config.windows, config.bsd, or
    config.macintosh to True, based on sys.platform.

    On Windows this also imports Bkrpr.winshell and binds it as the
    module-level name `winshell`.  Nothing is set for a platform that is
    not recognised.
    '''
    # read_config() looks `winshell` up as a module global, so the import
    # below must not create a function-local name.
    global winshell

    platform = sys.platform
    # Prefix match: Linux reports "linux2" on Python 2 and "linux" on
    # Python 3.3+.
    if platform.startswith('linux'):
        config.linux = True
    elif platform in ('win32', 'win64'):
        config.windows = True
        import Bkrpr.winshell as winshell
    elif platform == 'darwin':
        config.macintosh = True
    # BSD platform strings carry a version suffix (e.g. "freebsd13"), so an
    # exact comparison never matches.
    elif platform.startswith(('freebsd', 'openbsd', 'netbsd', 'dragonfly')):
        config.bsd = True
def read_config():
    '''Resolve the location of the user's bkrpr config file.

    Rewrites config.config_fname by prefixing a base directory: the value
    of winshell.application_data() when config.windows is set, the user's
    home directory when config.linux is set, and nothing otherwise.

    NOTE(review): only the path is computed here; nothing in this function
    reads the file or overrides any config.py defaults yet.
    '''
    base_dir = ''
    if config.windows:
        # Import locally so this function does not depend on some other
        # function having put `winshell` into module scope.
        import Bkrpr.winshell as winshell
        base_dir = winshell.application_data()
    elif config.linux:
        base_dir = os.path.expanduser('~')
    config.config_fname = os.path.join(base_dir, config.config_fname)
###############################################################################
def main():
    '''Program entry point: parse the command line, then run the GUI or
    the CLI.

    Steps, in order: delete a file named 'pause' from the current
    directory if present, detect the OS, resolve the config file path,
    parse sys.argv[1:], try to enable psyco, pass the verbose/debug
    settings to comm, and finally dispatch on config.cli.
    '''
    pause_file = 'pause'
    if os.path.exists(pause_file):
        os.unlink(pause_file)

    detect_os()
    read_config()
    opt = do_args(sys.argv[1:])
    #opt = do_parse()

    #if not config.debug:
    # psyco is optional; carry on without it when it is not installed.
    try:
        import psyco
        psyco.full()
    except ImportError:
        print("psyco not found, things might be slow.")

    comm.set_verbose(config.verbose())
    comm.set_debug(config.debug())

    if not config.cli:
        # GUI mode: wx is imported only when it is actually needed.
        import wx
        app = wx.App(False)
        GUI.Bkrpr(None, -1, 'Bkrpr %s' % config.version)
        app.MainLoop()
        return

    # CLI mode.
    bkrpr = cli.OcrGlob(opt)
    if 'generate_html' in opt:
        bkrpr.generate_html()
    bkrpr.main(opt)
# Start the program only when this file is executed as a script.
if __name__ == "__main__":
    main()
|