~bkrpr/bookliberator/trunk : contents of splitter.py at revision 81

~bkrpr/bookliberator/trunk : (revision 81)
#!/usr/bin/python

import math
import graphic
from misc import *


class hocr_parser:
    '''Parse HTML-OCR (hOCR) files.
    Right now, the only thing this class does is get_bounding_boxes.
    '''
    # Contents of the hOCR file as returned by file2array(); replaced per
    # instance in __init__.  get_bounding_boxes() iterates over it and treats
    # every element as a string (presumably one entry per line of the file).
    hocr = []

    def __init__(self, fname):
        '''Load the hOCR file named fname with file2array().'''
        self.hocr = file2array(fname)

    def get_bounding_boxes(self):
        '''Extracts the bounding boxes from the hocr description of each line of text.

        Returns a list with one entry per ocr_line element found.  Each
        entry is a list of the four integers that follow "bbox" in that
        element, in the order they appear in the file.
        '''
        # Imported here so the parser does not depend on "from misc import *"
        # happening to export the re module.
        import re

        # Non-greedy on purpose: when word-level elements (with their own
        # bbox) follow the ocr_line element on the same physical line, a
        # greedy ".*" would capture the last word's box instead of the
        # line's box.
        pattern = re.compile(r"ocr_line.*?bbox (\d+) (\d+) (\d+) (\d+)")

        bboxes = []
        for line in self.hocr:
            # finditer so that several ocr_line elements sharing one physical
            # line (e.g. hOCR written without line breaks) are all reported.
            for match in pattern.finditer(line):
                # A real list rather than map(): on Python 3 map() is a lazy
                # iterator and callers index the box (box[0] ... box[3]).
                bboxes.append([int(number) for number in match.groups()])
        return bboxes


class splitter():
    '''splitter(image, **opts)

    This class operates on images of text.  It separates that image
    into smaller images, each containing a picture of one line from
    that text.  It assumes somewhat parallel lines of dark text on a
    light background with no images.

    image is an image file spec, a PIL image or a graphic object.

    Pass in a **dictionary with options:
         brightness = percent brightness to adjust to before processing.  Final image is unenhanced.
         contrast = percent contrast to adjust to before processing.  Final image is unenhanced.
         tilt = the max degrees the text tilts from the horizon (optional, default 4)
         level = when true, rotate every cut-out line parallel to the horizon (optional, default False)
         hocr = name of an hOCR file; when given, split() cuts along the
                ocr_line bounding boxes in that file instead of searching
                for whitespace (optional)
    '''

    tiff = ''
    txt = ''
    opt={}

    def __init__(self, image, **opt):
        '''Store the options (with defaults filled in by set_defaults) and
        load the image twice: self.im is the working copy that split()
        enhances, self.unenhanced is the one the result strips are cut from.
        '''
        self.opt = set_defaults( {'brightness': 100,
                                  'contrast' : 100,
                                  'tilt': 4,
                                  'level' : False},
                                 opt)

        self.im = graphic.graphic(image)
        self.unenhanced = graphic.graphic(image)

    def get_blank_line(self, j, h_high):
        '''Look for a mostly white straight line across the image.

        Every candidate line starts at (0, j) on the left edge and ends at
        (self.im.w, y) on the right edge.  y starts at j and then alternates
        between the two sides of j at growing distance:
        j, j+1, j, j+2, j-1, j+3, j-2, ...  Candidates on the increasing side
        never go past h_high; the decreasing side has no bound of its own.

        Returns (j, y) for the first candidate whose self.im.whitespace()
        score is above 80, or (-1, -1) when no candidate qualifies.
        '''
        # NOTE(review): the score is presumably a percentage of white pixels
        # along the line -- confirm against graphic.whitespace.
        i = 0
        while i <= h_high - j:
            y = j + int(i + 0.5)
            r = self.im.whitespace((0,j), (self.im.w,y))
            if r > 80:
                return (j, y)

            # Grow the magnitude by 0.5 and flip the sign: a positive i
            # becomes negative, zero or a negative i becomes positive.
            i = (abs(i) + 0.5) * (-1 * ((i > 0) * 2 - 1))

        return -1, -1


    def get_cut_lines(self):
        '''find whitespace by trying to draw horizontalish lines that do not hit
        many black pixels

        Walks down the image and records a cut every time a blank line is
        found right after a stretch without one (i.e. just below a line of
        text).  Returns a list of (0, y_left, self.im.w, y_right) tuples.
        '''

        # Largest difference in height between the two ends of a line that
        # is tilted by at most opt['tilt'] degrees.
        height_range = int(calc_right_tri_leg(self.opt['tilt'], self.im.w))
        cut_line = []
        last = 'white'

        j = 0
        while j <= self.im.h:
            h_high = j+height_range
            if h_high > self.im.h:
                h_high = self.im.h

            coords = self.get_blank_line(j, h_high)
            if coords[0] != -1:
                if last == 'black':
                    cut_line.append((0,coords[0], self.im.w, coords[1]))
                ## cycle through blank lines faster (8 rows per step in all)
                j = j + 5
                last = 'white'
            else:
                ## first row without a blank line: note the change and skip
                ## ahead; later rows inside the text advance 3 at a time
                if last == 'white':
                    last = 'black'
                    j = j + 4
            j = j + 3

        return cut_line

    def level_line(self, region):
        '''Rotate region so its line of text is parallel to the horizon.

        Returns region untouched unless the 'level' option is set; otherwise
        returns the rotated region trimmed with trim(255).
        '''
        if not self.opt['level']:
            return region

        ## Whiteout everything except the line of text
        # NOTE(review): the whitest_line() results are not used yet, so
        # nothing is actually whited out.  The calls are kept as they were.
        b1, b2 = region.blackest_line()
        h1, h2 = region.whitest_line(0,int(mean(b1,b2)))
        h3, h4 = region.whitest_line(int(mean(b1,b2)), region.size[1])

        ## Rotate line of text parallel to horizon
        # NOTE(review): the slope is taken over the full image width
        # (self.im.w), not the width of region; for regions narrower than the
        # image (see cut_bbox) this looks too shallow -- confirm.
        s = slope((0,b1), (self.im.w,b2))
        degrees = math.degrees(math.atan(s))
        region = region.rotate(degrees, expand=1) #TODO: get a better rotate algorithm

        region = region.trim(255) #TODO: use imagemagic's fuzzy trim?

        return region


    def do_cuts(self, cuts, **kwargs):
        '''Cut the unenhanced image into strips along cuts.

        cuts is a list of (x1, y1, x2, y2) lines as returned by
        get_cut_lines().  Strip number n is the bounding rectangle of the
        area between cut n-1 (or the top edge for the first strip) and cut n.

        Keyword options:
             draw = when true, paint everything outside the two cut lines
                    with fill value 255 before cropping (default False)

        Returns the list of strips, each passed through level_line().
        Progress is printed to stdout.
        '''
        print("do_cuts")

        kwargs = set_defaults( {'draw' : False}, kwargs)

        # The first strip starts at the top edge of the image.
        last = (0,0,self.im.w,0)
        strips = []
        for counter, j in enumerate(cuts):
            print(counter)
            region = self.unenhanced.copy()
            if kwargs['draw']:
                draw = ImageDraw.Draw(region.im)
                # Blank out what lies above the previous cut ...
                draw.polygon(((0,0),(0,last[1]),(last[2],last[3]),(last[2],0)), fill=255)
                # ... and what lies below the current one.
                draw.polygon(((0,j[1]),(j[2],j[3]),(j[2],self.im.h), (0,self.im.h)), fill=255)
            region = region.crop((min(last[0], last[2]), min(last[1],last[3]),
                                   max(j[0], j[2]), max(j[1],j[3])))

            region = self.level_line(region)
            strips.append(region)
            last = j
        return strips

    def show_cuts(self, cuts):
        '''Debugging aid: draw cuts on a copy of the image, save that copy
        as cuts.png, display it and exit the program.
        '''
        import sys

        t = self.im.copy().im
        draw = ImageDraw.Draw(t)
        for j in cuts:
            draw.line(j)
        t.save('cuts.png')
        # Show the copy that carries the cut lines, not the untouched image.
        t.show()
        sys.exit()

    def get_bounding_boxes(self):
        '''Return the line boxes: read from the hOCR file when the 'hocr'
        option is set, otherwise the cut lines found by get_cut_lines()
        (which are also printed to stdout).
        '''
        # Same test as split(): an 'hocr' option that is present but empty
        # means "no hOCR file".
        if 'hocr' in self.opt and self.opt['hocr']:
            parser = hocr_parser(self.opt['hocr'])
            return parser.get_bounding_boxes()
        else:
            cuts = self.get_cut_lines()
            print(cuts)
            return cuts

    def cut_bbox(self, bbox):
        '''bbox is a list of tuples.  Each tuple has 4 elements.  Each
        element is a coordinate and the four coordinates are x,y of
        upper left and x,y of lower right of the bounding box.

        Returns a list with one region per box, cropped from the unenhanced
        image and passed through level_line().
        '''

        cuts=[]
        for box in bbox:
            # Crop from the unenhanced image, like do_cuts(): brightness and
            # contrast changes must not end up in the result.
            region = self.unenhanced.copy()
            region = region.crop((box[0], box[1], box[2], box[3]))

            region = self.level_line(region)
            cuts.append(region)

        return cuts

    def tune_bbox(self, bbox):
        '''Tune a bbox so it does not cut off letters.

        Not implemented yet: the box is returned unchanged.
        '''
        return bbox

    def tune_bboxes(self, bbox):
        '''Tune bboxes so they do not cut off letters.

        bbox is a list of 4-tuples containing left, top, right, bottom.
        Returns a new list with tune_bbox() applied to every box.
        '''
        return [self.tune_bbox(b) for b in bbox]


    def split(self):
        '''Split the image into one region per line of text.

        Enhances the working image with the brightness and contrast
        options, then cuts along the hOCR bounding boxes when the 'hocr'
        option is set, or along the detected whitespace otherwise.
        Returns the list of regions.
        '''
        self.im.enhance(self.opt['brightness'], self.opt['contrast'])

        if 'hocr' in self.opt and self.opt['hocr']:
            parser = hocr_parser(self.opt['hocr'])
            bbox = parser.get_bounding_boxes()
            bbox = self.tune_bboxes(bbox)
            cuts = self.cut_bbox(bbox)
        else:
            cuts = self.do_cuts(self.get_cut_lines())

        return cuts