~bkrpr/bookliberator/trunk

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#!/usr/bin/python

import os
import sys
import Image, ImageDraw, ImageEnhance, ImageChops, ImageFilter, ImageOps

import Bkrpr.config as config
from Bkrpr.oneline import comm
from Bkrpr.misc import do_all_iterator
import Bkrpr.Model as model
import Bkrpr.graphic as graphic
from Bkrpr.misc import *
import Bkrpr.files as files
import Bkrpr.splitter as splitter
import Bkrpr.rotater as rotater
import Bkrpr.cropper as cropper
import Bkrpr.pyocropus as pyocropus

#TODO: a lot of the work done by opt should be done by config.  Switching piecemeal might break CLI, but that's ok for now.

class pre_ocr_file:
    '''One image file queued for pre-OCR processing (unused class for now).

    Attributes:
        filespec -- the file specification given to the constructor
        im       -- graphic.graphic object loaded from filespec (replaced by
                    the saved task output when a task is resumed)
        cuts     -- list that receives the result of every 'lines' task run
    '''

    def __init__(self, filespec):
        '''Load filespec as a graphic and start with no recorded cuts.'''
        self.filespec = filespec
        self.im = graphic.graphic(filespec)
        self.cuts = []

    def task(self, task, **opt):
        '''Do one of a series of tasks (burst/rotate/crop/line-split).

        Only 'burst' (a stub that just prints a notice) and 'lines' have
        code here; any other task name is logged and otherwise ignored.

        task -- name of the task; also used to build the name of the
                expected output file, '<basename>.<task>.png'
        opt  -- extra keyword options, forwarded to splitter.splitter for
                the 'lines' task

        Returns True when the task ran or its output already existed, and
        False when the line splitter reported failure.
        '''
        stub = files.basename(self.filespec)
        dest_file = '%s.%s.png' % (stub, task)

        if config.cont and os.access(dest_file, os.F_OK):
            # Continue mode: pick up the output left by an earlier run.
            self.im = graphic.graphic(dest_file)
            comm.verbose("... already done.")
            # Report success explicitly.  Falling off the end here used to
            # return None, which a truthiness check reads as failure.
            return True

        comm.verbose("Applying %s to %s" % (task, self.filespec))

        if task == 'burst':
            comm.out("burst is not implemented yet.  Might never be.")
            #convert -quality 100 -density 600x600 multipage.pdf pg%d.tif
            ## Can we really do this here?  Does it make sense?
        elif task == 'lines':
            hocr = '%s.hocr.html' % stub
            self.im.save('%s.temp.png' % stub)
            # NOTE(review): the command is assembled as one plain string;
            # if syscall hands it to a shell, file names containing spaces
            # or shell metacharacters will break it -- confirm and quote.
            syscall('ocroscript rec-tess %s.temp.png > %s' % (stub, hocr))

            cuts = splitter.splitter(self.im, hocr=hocr, **opt).split()
            if cuts == False:
                return False
            self.cuts.append(cuts)
            # Save each cut as <basename>.L<n>.png, numbering from 1.
            for line, c in enumerate(cuts, 1):
                c.save('%s.L%d.png' % (stub, line))
        return True

    def process(self, **opt):
        '''Return the current graphic; opt is accepted but not used.'''
        return self.im # might be returning false if the processing failed

class pre_ocr_glob:
    '''Placeholder for handling a whole glob of files (unused class for now).'''

    def __init__(self, glob):
        '''Keep the glob pattern and the file list it expands to.'''
        self.path = glob
        matched = find_files_glob(glob)
        self.files = matched

    def process(self, **opt):
        '''Not implemented: hands back the name `fail`; opt is ignored.'''
        # NOTE(review): `fail` is not defined in the visible part of this
        # file; presumably it arrives via `from Bkrpr.misc import *` -- confirm.
        return fail

class crop_data:
    '''Crop record: source file name, destination file name and box edges.

    Attributes:
        original_fname -- the fname argument
        dest_fname     -- the dest argument
        left, top, right, bottom -- items 0, 1, 2 and 3 of the crop argument
    '''

    def __init__(self, fname, dest, crop):
        '''Store the two file names and unpack the first four crop items.

        fname -- name of the original file
        dest  -- name of the destination file
        crop  -- indexable with at least four items (left, top, right, bottom)
        '''
        self.original_fname = fname
        self.dest_fname = dest
        self.left = crop[0]
        self.top = crop[1]
        self.right = crop[2]
        self.bottom = crop[3]

    def __getitem__(self, item):
        '''Print the requested key and terminate the program via sys.exit().

        Indexing a crop_data never returns a value.
        NOTE(review): this looks like a debugging trap for callers that still
        index the record like a tuple -- confirm before relying on it.
        '''
        # Single-argument call form prints identically on Python 2 and is
        # valid syntax on Python 3, unlike the old `print item` statement.
        print(item)
        sys.exit()

class OcrGlob:
    '''Command-line driver: collects input files into a Model, then rotates
    and crops each page as requested by the options.

    Attributes:
        M -- the model.Model instance holding files, pages and the rotate
             and crop groups
    '''

    def __init__(self, opt):
        '''Gather all the filenames.

        opt -- mapping of options.  For each of the keys 'path', '90', '180',
               '270' and '360' that is present, its value is expanded with
               find_files_glob and the files are stored in the model under
               that key.  Pages from the numeric groups are marked as
               rotated by that many degrees at the user's request.
        '''

        self.M = model.Model()

        ## Put files into the data model
        for p in ['path', '90', '180', '270', '360']:
            if p in opt:
                self.M.rotate.add_group(p)
                self.M.crop.add_group(p)
                self.M.files[p] = []
                for f in find_files_glob(opt[p]):
                    self.M.files[p].append(f)

        self.M.pagify_files(self.M.files)

        for g in self.M.files:
            if g != 'path':
                # Every group other than 'path' is named after its rotation.
                for p in self.M.files[g]:
                    page = self.M.pages[p]
                    page.rotate_degrees = int(g)
                    page.rotate_method = page.USER
                    page.rotate_confidence = page.USER_SPECIFIED

    def generate_html(self):
        '''Make a test page that shows the original and cropped files, side by side.

        The HTML is printed to standard output.
        '''

        print("<html><body>")
        for p in self.M.pages:
            # Look the page up by key, as __init__ and main do; the old code
            # read `f.path` from a name that was never bound (NameError).
            # NOTE(review): assumes iterating M.pages yields its keys -- confirm.
            page = self.M.pages[p]
            print('<img src="" width="400"> <img src="%s" width="400"/><br />' % page.path)
        print("</body></html>")

    def main(self, opt):
        '''Do the entire commandline thing.

        opt -- mapping of options; 'rotate' and 'crop' select the steps, and
               the whole mapping is forwarded as keyword arguments to
               rotater.rotate and cropper.crop.

        Processing stops via sys.exit() as soon as a file named 'pause'
        exists in the current directory.  Once all pages are done, the
        cropper sanity check is run.
        '''

        # Progress is counted across every group, so the total must be too;
        # using only the current group's size made "N of M" overshoot.
        total = sum(len(self.M.files[g]) for g in self.M.files)

        count = 0
        for g in self.M.files:
            for p in self.M.files[g]:
                page = self.M.pages[p]

                if os.path.exists('pause'):
                    sys.exit()

                fname = page.path

                count += 1
                comm.verbose("Processing %s (%d of %d)" % (fname, count, total))

                if 'rotate' in opt:
                    rotater.rotate(self.M, p, **opt)
                if 'crop' in opt:
                    cropper.crop(self.M, p, **opt)

        sanity = cropper.SanityCheck(self.M)
        do_all_iterator(sanity.do_sanity_check)