~vcs-imports/tesseract-ocr/trunk

199 by theraysmith
Changes to textord for 3.00
1
///////////////////////////////////////////////////////////////////////
2
// File:        strokewidth.cpp
3
// Description: Subclass of BBGrid to find uniformity of strokewidth.
4
// Author:      Ray Smith
5
// Created:     Mon Mar 31 16:17:01 PST 2008
6
//
7
// (C) Copyright 2008, Google Inc.
8
// Licensed under the Apache License, Version 2.0 (the "License");
9
// you may not use this file except in compliance with the License.
10
// You may obtain a copy of the License at
11
// http://www.apache.org/licenses/LICENSE-2.0
12
// Unless required by applicable law or agreed to in writing, software
13
// distributed under the License is distributed on an "AS IS" BASIS,
14
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
// See the License for the specific language governing permissions and
16
// limitations under the License.
17
//
18
///////////////////////////////////////////////////////////////////////
19
247 by joregan
disable MSVC warning C4244 in a number of places to cut down the noise
20
#ifdef _MSC_VER
21
#pragma warning(disable:4244)  // Conversion warnings
22
#endif
23
751 by zdenop
fix build with -DGRAPHICS_DISABLED
24
#ifdef HAVE_CONFIG_H
25
#include "config_auto.h"
26
#endif
27
199 by theraysmith
Changes to textord for 3.00
28
#include "strokewidth.h"
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
29
30
#include <math.h>
31
199 by theraysmith
Changes to textord for 3.00
32
#include "blobbox.h"
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
33
#include "colpartition.h"
34
#include "colpartitiongrid.h"
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
35
#include "imagefind.h"
36
#include "linlsq.h"
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
37
#include "statistc.h"
199 by theraysmith
Changes to textord for 3.00
38
#include "tabfind.h"
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
39
#include "textlineprojection.h"
199 by theraysmith
Changes to textord for 3.00
40
#include "tordmain.h"  // For SetBlobStrokeWidth.
41
42
namespace tesseract {
43
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
44
// Tunable parameters for stroke-width based layout analysis. Each declaration
// gives the parameter name, its default value and a help string.
// Non-zero enables the debug displays created in this file (e.g. the
// "LeaderNeighbours" and "Initial textline Blobs" windows).
INT_VAR(textord_tabfind_show_strokewidths, 0, "Show stroke widths");
45
// When set, ~StrokeWidth() calls exit(0) after the stroke-width window closes.
BOOL_VAR(textord_tabfind_only_strokewidths, false, "Only run stroke widths");
46
// When false, TestVerticalTextDirection() always returns false.
BOOL_VAR(textord_tabfind_vertical_text, true, "Enable vertical detection");
47
// When set, TestVerticalTextDirection() always returns true.
BOOL_VAR(textord_tabfind_force_vertical_text, false,
48
         "Force using vertical text page mode");
49
// NOTE(review): not referenced in the visible part of this file; presumably
// consumed further down or elsewhere - confirm.
BOOL_VAR(textord_tabfind_vertical_horizontal_mix, true,
50
         "find horizontal lines such as headers in vertical page mode");
51
// Fraction of the (vertical + horizontal) blob count that the vertical count
// must reach for TestVerticalTextDirection() to report vertical text.
double_VAR(textord_tabfind_vertical_text_ratio, 0.5,
52
           "Fraction of textlines deemed vertical to use vertical page mode");
53
314 by joregan
more doxygen
54
/** Allowed proportional change in stroke width to be the same font. */
199 by theraysmith
Changes to textord for 3.00
55
const double kStrokeWidthFractionTolerance = 0.125;
314 by joregan
more doxygen
56
/**
57
 * Allowed constant change in stroke width to be the same font.
58
 * Really 1.5 pixels.
59
 */
199 by theraysmith
Changes to textord for 3.00
60
const double kStrokeWidthTolerance = 1.5;
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
61
// Same as the two tolerances above, but a bit more generous for CJK.
62
const double kStrokeWidthFractionCJK = 0.25;
63
const double kStrokeWidthCJK = 2.0;
64
// Radius in grid cells of search for broken CJK. Doesn't need to be very
65
// large as the grid size should be about the size of a character anyway.
66
const int kCJKRadius = 2;
67
// Max distance fraction of size to join close but broken CJK characters.
68
const double kCJKBrokenDistanceFraction = 0.25;
69
// Max number of components in a broken CJK character.
70
const int kCJKMaxComponents = 8;
71
// Max aspect ratio of CJK broken characters when put back together.
72
const double kCJKAspectRatio = 1.25;
73
// Max increase in aspect ratio of CJK broken characters when merged.
74
const double kCJKAspectRatioIncrease = 1.0625;
75
// Max multiple of the grid size that will be used in computing median CJK size.
76
const int kMaxCJKSizeRatio = 5;
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
77
// Min fraction of blobs that are broken CJK to iterate and run the fix again.
78
const double kBrokenCJKIterationFraction = 0.125;
79
// Multiple of gridsize as x-padding for a search box for diacritic base
80
// characters.
81
const double kDiacriticXPadRatio = 7.0;
82
// Multiple of gridsize as y-padding for a search box for diacritic base
83
// characters.
84
const double kDiacriticYPadRatio = 1.75;
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
85
// Min multiple of diacritic height that a neighbour must be to be a
86
// convincing base character.
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
87
const double kMinDiacriticSizeRatio = 1.0625;
88
// Max multiple of a textline's median height as a threshold for the sum of
89
// a diacritic's farthest x and y distances (gap + size).
90
const double kMaxDiacriticDistanceRatio = 1.25;
91
// Max x-gap between a diacritic and its base char as a fraction of the height
92
// of the base char (allowing other blobs to fill the gap).
93
const double kMaxDiacriticGapToBaseCharHeight = 1.0;
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
94
// Radius of a search for diacritics in grid units.
95
const int kSearchRadius = 2;
96
// Ratio between longest side of a line and longest side of a character.
97
// (neighbor_min > blob_min * kLineTrapShortest &&
98
//  neighbor_max < blob_max / kLineTrapLongest)
99
// => neighbor is a grapheme and blob is a line.
100
const int kLineTrapLongest = 4;
101
// Ratio between shortest side of a line and shortest side of a character.
102
const int kLineTrapShortest = 2;
103
// Max aspect ratio of the total box before CountNeighbourGaps
104
// decides immediately based on the aspect ratio.
105
const int kMostlyOneDirRatio = 3;
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
106
// Min height/width ratio for a blob to be considered as line residue.
107
const double kLineResidueAspectRatio = 8.0;
108
// Padding of the line residue search box, as a multiple of the blob height.
109
const int kLineResiduePadRatio = 3;
110
// Min multiple of the tallest neighbour's height that a candidate's height
// must exceed for the line residue to be genuine.
111
const double kLineResidueSizeRatio = 1.75;
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
112
// Max aspect ratio (longer side / shorter side) of a blob kept for OSD.
113
const float kSizeRatioToReject = 2.0;
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
114
// Max number of normal blobs a large blob may overlap before it is rejected
115
// and determined to be an image.
116
const int kMaxLargeOverlaps = 3;
117
// Expansion factor for search box for good neighbours.
118
const double kNeighbourSearchFactor = 2.5;
199 by theraysmith
Changes to textord for 3.00
119
120
// Builds an empty stroke-width grid with cells of the given size covering the
// rectangle from bleft to tright. The helper pointers (nontext map,
// projection, denorm) start out NULL, the grid box is the same rectangle and
// rerotation_ starts as (1, 0).
StrokeWidth::StrokeWidth(int gridsize,
                         const ICOORD& bleft, const ICOORD& tright)
    : BlobGrid(gridsize, bleft, tright),
      nontext_map_(NULL),
      projection_(NULL),
      denorm_(NULL),
      grid_box_(bleft, tright),
      rerotation_(1.0f, 0.0f) {
  // No debug window exists until one of the display paths creates it.
  widths_win_ = NULL;
  initial_widths_win_ = NULL;
  leaders_win_ = NULL;
  chains_win_ = NULL;
  textlines_win_ = NULL;
  smoothed_win_ = NULL;
  diacritics_win_ = NULL;
}
132
133
// Releases every debug window owned by this object.
// If the stroke-width window exists, a graphics-enabled build first waits on
// its SVET_DESTROY event (freeing the returned event), and the process exits
// immediately when textord_tabfind_only_strokewidths is set.
StrokeWidth::~StrokeWidth() {
  if (widths_win_ != NULL) {
#ifndef GRAPHICS_DISABLED
    delete widths_win_->AwaitEvent(SVET_DESTROY);
#endif  // GRAPHICS_DISABLED
    if (textord_tabfind_only_strokewidths) {
      // Stroke-width-only mode: nothing else is meant to run after this.
      exit(0);
    }
    delete widths_win_;
  }
  // Deleting a NULL pointer is a no-op, so the rest need no guards.
  delete leaders_win_;
  delete initial_widths_win_;
  delete chains_win_;
  delete textlines_win_;
  delete smoothed_win_;
  delete diacritics_win_;
}
149
150
// Sets the neighbours member of the medium-sized blobs in the block.
151
// Searches on 4 sides of each blob for similar-sized, similar-strokewidth
152
// blobs and sets pointers to the good neighbours.
153
void StrokeWidth::SetNeighboursOnMediumBlobs(TO_BLOCK* block) {
154
  // Run a preliminary strokewidth neighbour detection on the medium blobs.
155
  InsertBlobList(&block->blobs);
156
  BLOBNBOX_IT blob_it(&block->blobs);
157
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
158
    SetNeighbours(false, false, blob_it.data());
159
  }
160
  Clear();
161
}
162
163
// Sets the neighbour/textline writing direction members of the medium
164
// and large blobs with optional repair of broken CJK characters first.
165
// Repair of broken CJK is needed here because broken CJK characters
166
// can fool the textline direction detection algorithm.
167
void StrokeWidth::FindTextlineDirectionAndFixBrokenCJK(bool cjk_merge,
168
                                                       TO_BLOCK* input_block) {
169
  // Setup the grid with the remaining (non-noise) blobs.
170
  InsertBlobs(input_block);
171
  // Repair broken CJK characters if needed.
172
  while (cjk_merge && FixBrokenCJK(input_block));
173
  // Grade blobs by inspection of neighbours.
174
  FindTextlineFlowDirection(false);
175
  // Clear the grid ready for rotation or leader finding.
176
  Clear();
177
}
178
179
// Helper to collect and count horizontal and vertical blobs from a list.
180
static void CollectHorizVertBlobs(BLOBNBOX_LIST* input_blobs,
181
                                  int* num_vertical_blobs,
182
                                  int* num_horizontal_blobs,
183
                                  BLOBNBOX_CLIST* vertical_blobs,
184
                                  BLOBNBOX_CLIST* horizontal_blobs,
185
                                  BLOBNBOX_CLIST* nondescript_blobs) {
186
  BLOBNBOX_C_IT v_it(vertical_blobs);
187
  BLOBNBOX_C_IT h_it(horizontal_blobs);
188
  BLOBNBOX_C_IT n_it(nondescript_blobs);
189
  BLOBNBOX_IT blob_it(input_blobs);
190
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
191
    BLOBNBOX* blob = blob_it.data();
192
    const TBOX& box = blob->bounding_box();
193
    float y_x = static_cast<float>(box.height()) / box.width();
194
    float x_y = 1.0f / y_x;
195
    // Select a >= 1.0 ratio
196
    float ratio = x_y > y_x ? x_y : y_x;
197
    // If the aspect ratio is small and we want them for osd, save the blob.
198
    bool ok_blob = ratio <= kSizeRatioToReject;
199
    if (blob->UniquelyVertical()) {
200
      ++*num_vertical_blobs;
201
      if (ok_blob) v_it.add_after_then_move(blob);
202
    } else if (blob->UniquelyHorizontal()) {
203
      ++*num_horizontal_blobs;
204
      if (ok_blob) h_it.add_after_then_move(blob);
205
    } else if (ok_blob) {
206
      n_it.add_after_then_move(blob);
207
    }
208
  }
209
}
210
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
211
212
// Types all the blobs as vertical or horizontal text or unknown and
213
// returns true if the majority are vertical.
214
// If the blobs are rotated, it is necessary to call CorrectForRotation
215
// after rotating everything, otherwise the work done here will be enough.
216
// If osd_blobs is not null, a list of blobs from the dominant textline
217
// direction are returned for use in orientation and script detection.
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
218
bool StrokeWidth::TestVerticalTextDirection(TO_BLOCK* block,
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
219
                                            BLOBNBOX_CLIST* osd_blobs) {
220
  if (textord_tabfind_force_vertical_text) return true;
221
  if (!textord_tabfind_vertical_text) return false;
222
223
  int vertical_boxes = 0;
224
  int horizontal_boxes = 0;
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
225
  // Count vertical normal and large blobs.
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
226
  BLOBNBOX_CLIST vertical_blobs;
227
  BLOBNBOX_CLIST horizontal_blobs;
228
  BLOBNBOX_CLIST nondescript_blobs;
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
229
  CollectHorizVertBlobs(&block->blobs, &vertical_boxes, &horizontal_boxes,
230
                        &vertical_blobs, &horizontal_blobs, &nondescript_blobs);
231
  CollectHorizVertBlobs(&block->large_blobs, &vertical_boxes, &horizontal_boxes,
232
                        &vertical_blobs, &horizontal_blobs, &nondescript_blobs);
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
233
  if (textord_debug_tabfind)
234
    tprintf("TextDir hbox=%d vs vbox=%d, %dH, %dV, %dN osd blobs\n",
235
            horizontal_boxes, vertical_boxes,
236
            horizontal_blobs.length(), vertical_blobs.length(),
237
            nondescript_blobs.length());
238
  if (osd_blobs != NULL && vertical_boxes == 0 && horizontal_boxes == 0) {
239
    // Only nondescript blobs available, so return those.
240
    BLOBNBOX_C_IT osd_it(osd_blobs);
241
    osd_it.add_list_after(&nondescript_blobs);
242
    return false;
243
  }
244
  int min_vert_boxes = static_cast<int>((vertical_boxes + horizontal_boxes) *
245
                                        textord_tabfind_vertical_text_ratio);
246
  if (vertical_boxes >= min_vert_boxes) {
247
    if (osd_blobs != NULL) {
248
      BLOBNBOX_C_IT osd_it(osd_blobs);
249
      osd_it.add_list_after(&vertical_blobs);
250
    }
251
    return true;
252
  } else {
253
    if (osd_blobs != NULL) {
254
      BLOBNBOX_C_IT osd_it(osd_blobs);
255
      osd_it.add_list_after(&horizontal_blobs);
256
    }
257
    return false;
258
  }
259
}
260
261
// Corrects the data structures for the given rotation.
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
262
void StrokeWidth::CorrectForRotation(const FCOORD& rotation,
263
                                     ColPartitionGrid* part_grid) {
264
  Init(part_grid->gridsize(), part_grid->bleft(), part_grid->tright());
265
  grid_box_ = TBOX(bleft(), tright());
266
  rerotation_.set_x(rotation.x());
267
  rerotation_.set_y(-rotation.y());
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
268
}
269
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
270
// Finds leader partitions and inserts them into the given part_grid.
271
void StrokeWidth::FindLeaderPartitions(TO_BLOCK* block,
272
                                       ColPartitionGrid* part_grid) {
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
273
  Clear();
274
  // Find and isolate leaders in the noise list.
275
  ColPartition_LIST leader_parts;
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
276
  FindLeadersAndMarkNoise(block, &leader_parts);
277
  // Setup the strokewidth grid with the block's remaining (non-noise) blobs.
278
  InsertBlobList(&block->blobs);
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
279
  // Mark blobs that have leader neighbours.
280
  for (ColPartition_IT it(&leader_parts); !it.empty(); it.forward()) {
281
    ColPartition* part = it.extract();
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
282
    part->ClaimBoxes();
283
    MarkLeaderNeighbours(part, LR_LEFT);
284
    MarkLeaderNeighbours(part, LR_RIGHT);
285
    part_grid->InsertBBox(true, true, part);
286
  }
287
}
288
289
// Finds and marks noise those blobs that look like bits of vertical lines
290
// that would otherwise screw up layout analysis.
291
void StrokeWidth::RemoveLineResidue(ColPartition_LIST* big_part_list) {
292
  BlobGridSearch gsearch(this);
293
  BLOBNBOX* bbox;
294
  // For every vertical line-like bbox in the grid, search its neighbours
295
  // to find the tallest, and if the original box is taller by sufficient
296
  // margin, then call it line residue and delete it.
297
  gsearch.StartFullSearch();
298
  while ((bbox = gsearch.NextFullSearch()) != NULL) {
299
    TBOX box = bbox->bounding_box();
300
    if (box.height() < box.width() * kLineResidueAspectRatio)
301
      continue;
302
    // Set up a rectangle search around the blob to find the size of its
303
    // neighbours.
304
    int padding = box.height() * kLineResiduePadRatio;
305
    TBOX search_box = box;
306
    search_box.pad(padding, padding);
307
    bool debug = AlignedBlob::WithinTestRegion(2, box.left(),
308
                                               box.bottom());
309
    // Find the largest object in the search box not equal to bbox.
310
    BlobGridSearch rsearch(this);
311
    int max_size = 0;
312
    BLOBNBOX* n;
313
    rsearch.StartRectSearch(search_box);
314
    while ((n = rsearch.NextRectSearch()) != NULL) {
315
      if (n == bbox) continue;
316
      TBOX nbox = n->bounding_box();
317
      if (nbox.height() > max_size) {
318
        max_size = nbox.height();
319
      }
320
    }
321
    if (debug) {
322
      tprintf("Max neighbour size=%d for candidate line box at:", max_size);
323
      box.print();
324
    }
325
    if (max_size * kLineResidueSizeRatio < box.height()) {
519 by zdenop at gmail
fix for GRAPHICS_DISABLED build
326
      #ifndef GRAPHICS_DISABLED
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
327
      if (leaders_win_ != NULL) {
328
        // We are debugging, so display deleted in pink blobs in the same
329
        // window that we use to display leader detection.
330
        leaders_win_->Pen(ScrollView::PINK);
331
        leaders_win_->Rectangle(box.left(), box.bottom(),
332
                                box.right(), box.top());
333
      }
519 by zdenop at gmail
fix for GRAPHICS_DISABLED build
334
      #endif  // GRAPHICS_DISABLED
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
335
      ColPartition::MakeBigPartition(bbox, big_part_list);
336
    }
337
  }
338
}
339
340
// Types all the blobs as vertical text or horizontal text or unknown and
341
// puts them into initial ColPartitions in the supplied part_grid.
342
// rerotation determines how to get back to the image coordinates from the
343
// blob coordinates (since they may have been rotated for vertical text).
344
// block is the single block for the whole page or rectangle to be OCRed.
345
// nontext_pix (full-size), is a binary mask used to prevent merges across
346
// photo/text boundaries. It is not kept beyond this function.
347
// denorm provides a mapping back to the image from the current blob
348
// coordinate space.
349
// projection provides a measure of textline density over the image and
350
// provides functions to assist with diacritic detection. It should be a
351
// pointer to a new TextlineProjection, and will be setup here.
352
// part_grid is the output grid of textline partitions.
353
// Large blobs that cause overlap are put in separate partitions and added
354
// to the big_parts list.
355
void StrokeWidth::GradeBlobsIntoPartitions(const FCOORD& rerotation,
356
                                           TO_BLOCK* block,
357
                                           Pix* nontext_pix,
358
                                           const DENORM* denorm,
789 by theraysmith at gmail
Fixed issue 979
359
                                           bool cjk_script,
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
360
                                           TextlineProjection* projection,
361
                                           ColPartitionGrid* part_grid,
362
                                           ColPartition_LIST* big_parts) {
363
  nontext_map_ = nontext_pix;
364
  projection_ = projection;
365
  denorm_ = denorm;
366
  // Clear and re Insert to take advantage of the tab stops in the blobs.
367
  Clear();
368
  // Setup the strokewidth grid with the remaining non-noise, non-leader blobs.
369
  InsertBlobs(block);
370
789 by theraysmith at gmail
Fixed issue 979
371
  // Run FixBrokenCJK() again if the page is CJK.
372
  if (cjk_script) {
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
373
    FixBrokenCJK(block);
374
  }
375
  FindTextlineFlowDirection(true);
376
  projection_->ConstructProjection(block, rerotation, nontext_map_);
377
  if (textord_tabfind_show_strokewidths) {
378
    ScrollView* line_blobs_win = MakeWindow(0, 0, "Initial textline Blobs");
379
    projection_->PlotGradedBlobs(&block->blobs, line_blobs_win);
380
    projection_->PlotGradedBlobs(&block->small_blobs, line_blobs_win);
381
  }
382
  projection_->MoveNonTextlineBlobs(&block->blobs, &block->noise_blobs);
383
  projection_->MoveNonTextlineBlobs(&block->small_blobs, &block->noise_blobs);
384
  // Clear and re Insert to take advantage of the removed diacritics.
385
  Clear();
386
  InsertBlobs(block);
387
  FindInitialPartitions(rerotation, block, part_grid, big_parts);
388
  nontext_map_ = NULL;
389
  projection_ = NULL;
390
  denorm_ = NULL;
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
391
}
392
393
// Debug helper: prints the bounding box of the blob together with its
// horizontal and vertical stroke widths and a third width computed as
// 2 * area / perimeter of the blob's cblob(), each to one decimal place.
// If the blob has no cblob(), that third width is printed as "n/a".
static void PrintBoxWidths(BLOBNBOX* neighbour) {
  const TBOX nbox = neighbour->bounding_box();
  if (neighbour->cblob() == NULL) {
    // Callers may pass neighbour blobs whose cblob() was never checked.
    tprintf("Box (%d,%d)->(%d,%d): h-width=%.1f, v-width=%.1f p-width=n/a\n",
            nbox.left(), nbox.bottom(), nbox.right(), nbox.top(),
            neighbour->horz_stroke_width(), neighbour->vert_stroke_width());
    return;
  }
  // All three widths use %.1f; the original "%1.f" (width 1, precision 0)
  // dropped the fractional part of the last one.
  tprintf("Box (%d,%d)->(%d,%d): h-width=%.1f, v-width=%.1f p-width=%.1f\n",
          nbox.left(), nbox.bottom(), nbox.right(), nbox.top(),
          neighbour->horz_stroke_width(), neighbour->vert_stroke_width(),
          2.0 * neighbour->cblob()->area()/neighbour->cblob()->perimeter());
}
400
401
/** Handles a click event in a display window. */
402
void StrokeWidth::HandleClick(int x, int y) {
403
  BBGrid<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT>::HandleClick(x, y);
404
  // Run a radial search for blobs that overlap.
405
  BlobGridSearch radsearch(this);
406
  radsearch.StartRadSearch(x, y, 1);
407
  BLOBNBOX* neighbour;
408
  FCOORD click(static_cast<float>(x), static_cast<float>(y));
409
  while ((neighbour = radsearch.NextRadSearch()) != NULL) {
410
    TBOX nbox = neighbour->bounding_box();
411
    if (nbox.contains(click) && neighbour->cblob() != NULL) {
412
      PrintBoxWidths(neighbour);
413
      if (neighbour->neighbour(BND_LEFT) != NULL)
414
        PrintBoxWidths(neighbour->neighbour(BND_LEFT));
415
      if (neighbour->neighbour(BND_RIGHT) != NULL)
416
        PrintBoxWidths(neighbour->neighbour(BND_RIGHT));
417
      if (neighbour->neighbour(BND_ABOVE) != NULL)
418
        PrintBoxWidths(neighbour->neighbour(BND_ABOVE));
419
      if (neighbour->neighbour(BND_BELOW) != NULL)
420
        PrintBoxWidths(neighbour->neighbour(BND_BELOW));
421
      int gaps[BND_COUNT];
422
      neighbour->NeighbourGaps(gaps);
423
      tprintf("Left gap=%d, right=%d, above=%d, below=%d, horz=%d, vert=%d\n"
424
              "Good=    %d        %d        %d        %d\n",
425
              gaps[BND_LEFT], gaps[BND_RIGHT],
426
              gaps[BND_ABOVE], gaps[BND_BELOW],
427
              neighbour->horz_possible(),
428
              neighbour->vert_possible(),
429
              neighbour->good_stroke_neighbour(BND_LEFT),
430
              neighbour->good_stroke_neighbour(BND_RIGHT),
431
              neighbour->good_stroke_neighbour(BND_ABOVE),
432
              neighbour->good_stroke_neighbour(BND_BELOW));
433
      break;
434
    }
435
  }
436
}
437
438
// Detects and marks leader dots/dashes.
439
//    Leaders are horizontal chains of small or noise blobs that look
440
//    monospace according to ColPartition::MarkAsLeaderIfMonospaced().
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
441
// Detected leaders become the only occupants of the block->small_blobs list.
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
442
// Non-leader small blobs get moved to the blobs list.
443
// Non-leader noise blobs remain singletons in the noise list.
444
// All small and noise blobs in high density regions are marked BTFT_NONTEXT.
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
445
// block is the single block for the whole page or rectangle to be OCRed.
446
// leader_parts is the output.
447
void StrokeWidth::FindLeadersAndMarkNoise(TO_BLOCK* block,
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
448
                                          ColPartition_LIST* leader_parts) {
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
449
  InsertBlobList(&block->small_blobs);
450
  InsertBlobList(&block->noise_blobs);
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
451
  BlobGridSearch gsearch(this);
452
  BLOBNBOX* bbox;
453
  // For every bbox in the grid, set its neighbours.
454
  gsearch.StartFullSearch();
455
  while ((bbox = gsearch.NextFullSearch()) != NULL) {
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
456
    SetNeighbours(true, false, bbox);
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
457
  }
458
  ColPartition_IT part_it(leader_parts);
459
  gsearch.StartFullSearch();
460
  while ((bbox = gsearch.NextFullSearch()) != NULL) {
461
    if (bbox->flow() == BTFT_NONE) {
462
      if (bbox->neighbour(BND_RIGHT) == NULL &&
463
          bbox->neighbour(BND_LEFT) == NULL)
464
        continue;
465
      // Put all the linked blobs into a ColPartition.
466
      ColPartition* part = new ColPartition(BRT_UNKNOWN, ICOORD(0, 1));
467
      BLOBNBOX* blob;
468
      for (blob = bbox; blob != NULL && blob->flow() == BTFT_NONE;
469
           blob = blob->neighbour(BND_RIGHT))
470
        part->AddBox(blob);
471
      for (blob = bbox->neighbour(BND_LEFT); blob != NULL &&
472
           blob->flow() == BTFT_NONE;
473
           blob = blob->neighbour(BND_LEFT))
474
        part->AddBox(blob);
475
      if (part->MarkAsLeaderIfMonospaced())
476
        part_it.add_after_then_move(part);
477
      else
478
        delete part;
479
    }
480
  }
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
481
  if (textord_tabfind_show_strokewidths) {
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
482
    leaders_win_ = DisplayGoodBlobs("LeaderNeighbours", 0, 0);
483
  }
484
  // Move any non-leaders from the small to the blobs list, as they are
485
  // most likely dashes or broken characters.
486
  BLOBNBOX_IT blob_it(&block->blobs);
487
  BLOBNBOX_IT small_it(&block->small_blobs);
488
  for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) {
489
    BLOBNBOX* blob = small_it.data();
490
    if (blob->flow() != BTFT_LEADER) {
491
      if (blob->flow() == BTFT_NEIGHBOURS)
492
        blob->set_flow(BTFT_NONE);
493
      blob->ClearNeighbours();
494
      blob_it.add_to_end(small_it.extract());
495
    }
496
  }
497
  // Move leaders from the noise list to the small list, leaving the small
498
  // list exclusively leaders, so they don't get processed further,
499
  // and the remaining small blobs all in the noise list.
500
  BLOBNBOX_IT noise_it(&block->noise_blobs);
501
  for (noise_it.mark_cycle_pt(); !noise_it.cycled_list(); noise_it.forward()) {
502
    BLOBNBOX* blob = noise_it.data();
503
    if (blob->flow() == BTFT_LEADER || blob->joined_to_prev()) {
504
      small_it.add_to_end(noise_it.extract());
505
    } else if (blob->flow() == BTFT_NEIGHBOURS) {
506
      blob->set_flow(BTFT_NONE);
507
      blob->ClearNeighbours();
508
    }
509
  }
510
  // Clear the grid as we don't want the small stuff hanging around in it.
511
  Clear();
199 by theraysmith
Changes to textord for 3.00
512
}
513
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
514
/** Inserts the block blobs (normal and large) into this grid.
515
 * Blobs remain owned by the block. */
516
void StrokeWidth::InsertBlobs(TO_BLOCK* block) {
517
  InsertBlobList(&block->blobs);
518
  InsertBlobList(&block->large_blobs);
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
519
}
520
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
521
// Checks the left or right side of the given leader partition and sets the
522
// (opposite) leader_on_right or leader_on_left flags for blobs
523
// that are next to the given side of the given leader partition.
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
524
void StrokeWidth::MarkLeaderNeighbours(const ColPartition* part,
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
525
                                       LeftOrRight side) {
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
526
  const TBOX& part_box = part->bounding_box();
527
  BlobGridSearch blobsearch(this);
528
  // Search to the side of the leader for the nearest neighbour.
529
  BLOBNBOX* best_blob = NULL;
530
  int best_gap = 0;
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
531
  blobsearch.StartSideSearch(side == LR_LEFT ? part_box.left()
532
                                             : part_box.right(),
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
533
                             part_box.bottom(), part_box.top());
534
  BLOBNBOX* blob;
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
535
  while ((blob = blobsearch.NextSideSearch(side == LR_LEFT)) != NULL) {
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
536
    const TBOX& blob_box = blob->bounding_box();
537
    if (!blob_box.y_overlap(part_box))
538
      continue;
539
    int x_gap = blob_box.x_gap(part_box);
540
    if (x_gap > 2 * gridsize()) {
541
      break;
542
    } else if (best_blob == NULL || x_gap < best_gap) {
543
      best_blob = blob;
544
      best_gap = x_gap;
545
    }
546
  }
547
  if (best_blob != NULL) {
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
548
    if (side == LR_LEFT)
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
549
      best_blob->set_leader_on_right(true);
550
    else
551
      best_blob->set_leader_on_left(true);
519 by zdenop at gmail
fix for GRAPHICS_DISABLED build
552
    #ifndef GRAPHICS_DISABLED
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
553
    if (leaders_win_ != NULL) {
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
554
      leaders_win_->Pen(side == LR_LEFT ? ScrollView::RED : ScrollView::GREEN);
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
555
      const TBOX& blob_box = best_blob->bounding_box();
556
      leaders_win_->Rectangle(blob_box.left(), blob_box.bottom(),
557
                              blob_box.right(), blob_box.top());
558
    }
519 by zdenop at gmail
fix for GRAPHICS_DISABLED build
559
    #endif  // GRAPHICS_DISABLED
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
560
  }
561
}
562
563
// Helper to compute the UQ of the square-ish CJK charcters.
564
static int UpperQuartileCJKSize(int gridsize, BLOBNBOX_LIST* blobs) {
565
  STATS sizes(0, gridsize * kMaxCJKSizeRatio);
566
  BLOBNBOX_IT it(blobs);
567
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
568
    BLOBNBOX* blob = it.data();
569
    int width = blob->bounding_box().width();
570
    int height = blob->bounding_box().height();
571
    if (width <= height * kCJKAspectRatio && height < width * kCJKAspectRatio)
572
      sizes.add(height, 1);
573
  }
574
  return static_cast<int>(sizes.ile(0.75f) + 0.5);
575
}
576
577
// Fix broken CJK characters, using the fake joined blobs mechanism.
578
// Blobs are really merged, ie the master takes all the outlines and the
579
// others are deleted.
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
580
// Returns true if sufficient blobs are merged that it may be worth running
581
// again, due to a better estimate of character size.
582
bool StrokeWidth::FixBrokenCJK(TO_BLOCK* block) {
583
  BLOBNBOX_LIST* blobs = &block->blobs;
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
584
  int median_height = UpperQuartileCJKSize(gridsize(), blobs);
585
  int max_dist = static_cast<int>(median_height * kCJKBrokenDistanceFraction);
586
  int max_size = static_cast<int>(median_height * kCJKAspectRatio);
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
587
  int num_fixed = 0;
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
588
  BLOBNBOX_IT blob_it(blobs);
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
589
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
590
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
591
    BLOBNBOX* blob = blob_it.data();
592
    if (blob->cblob() == NULL || blob->cblob()->out_list()->empty())
593
      continue;
594
    TBOX bbox = blob->bounding_box();
595
    bool debug = AlignedBlob::WithinTestRegion(3, bbox.left(),
596
                                               bbox.bottom());
597
    if (debug) {
598
      tprintf("Checking for Broken CJK (max size=%d):", max_size);
599
      bbox.print();
600
    }
601
    // Generate a list of blobs that overlap or are near enough to merge.
602
    BLOBNBOX_CLIST overlapped_blobs;
603
    AccumulateOverlaps(blob, debug, max_size, max_dist,
604
                       &bbox, &overlapped_blobs);
605
    if (!overlapped_blobs.empty()) {
606
      // There are overlapping blobs, so qualify them as being satisfactory
607
      // before removing them from the grid and replacing them with the union.
608
      // The final box must be roughly square.
609
      if (bbox.width() > bbox.height() * kCJKAspectRatio ||
610
          bbox.height() > bbox.width() * kCJKAspectRatio) {
611
        if (debug) {
612
          tprintf("Bad final aspectratio:");
613
          bbox.print();
614
        }
615
        continue;
616
      }
617
      // There can't be too many blobs to merge.
618
      if (overlapped_blobs.length() >= kCJKMaxComponents) {
619
        if (debug)
620
          tprintf("Too many neighbours: %d\n", overlapped_blobs.length());
621
        continue;
622
      }
623
      // The strokewidths must match amongst the join candidates.
624
      BLOBNBOX_C_IT n_it(&overlapped_blobs);
625
      for (n_it.mark_cycle_pt(); !n_it.cycled_list(); n_it.forward()) {
626
        BLOBNBOX* neighbour = NULL;
627
        neighbour = n_it.data();
628
        if (!blob->MatchingStrokeWidth(*neighbour, kStrokeWidthFractionCJK,
629
                                       kStrokeWidthCJK))
630
          break;
631
      }
632
      if (!n_it.cycled_list()) {
633
        if (debug) {
634
          tprintf("Bad stroke widths:");
635
          PrintBoxWidths(blob);
636
        }
637
        continue;  // Not good enough.
638
      }
639
640
      // Merge all the candidates into blob.
641
      // We must remove blob from the grid and reinsert it after merging
642
      // to maintain the integrity of the grid.
643
      RemoveBBox(blob);
644
      // Everything else will be calculated later.
645
      for (n_it.mark_cycle_pt(); !n_it.cycled_list(); n_it.forward()) {
646
        BLOBNBOX* neighbour = n_it.data();
647
        RemoveBBox(neighbour);
623 by theraysmith at gmail
Added sparse text mode, also fixed issue 653.
648
        // Mark empty blob for deletion.
649
        neighbour->set_region_type(BRT_NOISE);
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
650
        blob->really_merge(neighbour);
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
651
        if (rerotation_.x() != 1.0f || rerotation_.y() != 0.0f) {
652
          blob->rotate_box(rerotation_);
653
        }
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
654
      }
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
655
      InsertBBox(true, true, blob);
656
      ++num_fixed;
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
657
      if (debug) {
658
        tprintf("Done! Final box:");
659
        bbox.print();
660
      }
661
    }
662
  }
623 by theraysmith at gmail
Added sparse text mode, also fixed issue 653.
663
  // Count remaining blobs.
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
664
  int num_remaining = 0;
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
665
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
666
    BLOBNBOX* blob = blob_it.data();
623 by theraysmith at gmail
Added sparse text mode, also fixed issue 653.
667
    if (blob->cblob() != NULL && !blob->cblob()->out_list()->empty()) {
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
668
      ++num_remaining;
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
669
    }
670
  }
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
671
  // Permanently delete all the marked blobs after first removing all
672
  // references in the neighbour members.
673
  block->DeleteUnownedNoise();
674
  return num_fixed > num_remaining * kBrokenCJKIterationFraction;
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
675
}
676
677
// Helper function to determine whether it is reasonable to merge the
678
// bbox and the nbox for repairing broken CJK.
679
// The distance apart must not exceed max_dist, the combined size must
680
// not exceed max_size, and the aspect ratio must either improve or at
681
// least not get worse by much.
682
static bool AcceptableCJKMerge(const TBOX& bbox, const TBOX& nbox,
683
                               bool debug, int max_size, int max_dist,
684
                               int* x_gap, int* y_gap) {
685
  *x_gap = bbox.x_gap(nbox);
686
  *y_gap = bbox.y_gap(nbox);
687
  TBOX merged(nbox);
688
  merged += bbox;
689
  if (debug) {
690
    tprintf("gaps = %d, %d, merged_box:", *x_gap, *y_gap);
691
    merged.print();
692
  }
693
  if (*x_gap <= max_dist && *y_gap <= max_dist &&
694
      merged.width() <= max_size && merged.height() <= max_size) {
695
    // Close enough to call overlapping. Check aspect ratios.
696
    double old_ratio = static_cast<double>(bbox.width()) / bbox.height();
697
    if (old_ratio < 1.0) old_ratio = 1.0 / old_ratio;
698
    double new_ratio = static_cast<double>(merged.width()) / merged.height();
699
    if (new_ratio < 1.0) new_ratio = 1.0 / new_ratio;
700
    if (new_ratio <= old_ratio * kCJKAspectRatioIncrease)
701
      return true;
702
  }
703
  return false;
704
}
705
706
// Collect blobs that overlap or are within max_dist of the input bbox.
707
// Return them in the list of blobs and expand the bbox to be the union
708
// of all the boxes. not_this is excluded from the search, as are blobs
709
// that cause the merged box to exceed max_size in either dimension.
710
void StrokeWidth::AccumulateOverlaps(const BLOBNBOX* not_this, bool debug,
711
                                     int max_size, int max_dist,
712
                                     TBOX* bbox, BLOBNBOX_CLIST* blobs) {
713
  // While searching, nearests holds the nearest failed blob in each
714
  // direction. When we have a nearest in each of the 4 directions, then
715
  // the search is over, and at this point the final bbox must not overlap
716
  // any of the nearests.
717
  BLOBNBOX* nearests[BND_COUNT];
718
  for (int i = 0; i < BND_COUNT; ++i) {
719
    nearests[i] = NULL;
720
  }
721
  int x = (bbox->left() + bbox->right()) / 2;
722
  int y = (bbox->bottom() + bbox->top()) / 2;
723
  // Run a radial search for blobs that overlap or are sufficiently close.
724
  BlobGridSearch radsearch(this);
725
  radsearch.StartRadSearch(x, y, kCJKRadius);
726
  BLOBNBOX* neighbour;
727
  while ((neighbour = radsearch.NextRadSearch()) != NULL) {
728
    if (neighbour == not_this) continue;
729
    TBOX nbox = neighbour->bounding_box();
730
    int x_gap, y_gap;
731
    if (AcceptableCJKMerge(*bbox, nbox, debug, max_size, max_dist,
732
                           &x_gap, &y_gap)) {
733
      // Close enough to call overlapping. Merge boxes.
734
      *bbox += nbox;
735
      blobs->add_sorted(SortByBoxLeft<BLOBNBOX>, true, neighbour);
736
      if (debug) {
737
        tprintf("Added:");
738
        nbox.print();
739
      }
740
      // Since we merged, search the nearests, as some might now me mergeable.
741
      for (int dir = 0; dir < BND_COUNT; ++dir) {
742
        if (nearests[dir] == NULL) continue;
743
        nbox = nearests[dir]->bounding_box();
744
        if (AcceptableCJKMerge(*bbox, nbox, debug, max_size,
745
                               max_dist, &x_gap, &y_gap)) {
746
          // Close enough to call overlapping. Merge boxes.
747
          *bbox += nbox;
748
          blobs->add_sorted(SortByBoxLeft<BLOBNBOX>, true, nearests[dir]);
749
          if (debug) {
750
            tprintf("Added:");
751
            nbox.print();
752
          }
753
          nearests[dir] = NULL;
754
          dir = -1;  // Restart the search.
755
        }
756
      }
757
    } else if (x_gap < 0 && x_gap <= y_gap) {
758
      // A vertical neighbour. Record the nearest.
759
      BlobNeighbourDir dir = nbox.top() > bbox->top() ? BND_ABOVE : BND_BELOW;
760
      if (nearests[dir] == NULL ||
761
          y_gap < bbox->y_gap(nearests[dir]->bounding_box())) {
762
        nearests[dir] = neighbour;
763
      }
764
    } else if (y_gap < 0 && y_gap <= x_gap) {
765
      // A horizontal neighbour. Record the nearest.
766
      BlobNeighbourDir dir = nbox.left() > bbox->left() ? BND_RIGHT : BND_LEFT;
767
      if (nearests[dir] == NULL ||
768
          x_gap < bbox->x_gap(nearests[dir]->bounding_box())) {
769
        nearests[dir] = neighbour;
770
      }
771
    }
772
    // If all nearests are non-null, then we have finished.
773
    if (nearests[BND_LEFT] && nearests[BND_RIGHT] &&
774
        nearests[BND_ABOVE] && nearests[BND_BELOW])
775
      break;
776
  }
777
  // Final overlap with a nearest is not allowed.
778
  for (int dir = 0; dir < BND_COUNT; ++dir) {
779
    if (nearests[dir] == NULL) continue;
780
    const TBOX& nbox = nearests[dir]->bounding_box();
781
    if (debug) {
782
      tprintf("Testing for overlap with:");
783
      nbox.print();
784
    }
785
    if (bbox->overlap(nbox)) {
786
      blobs->shallow_clear();
787
      if (debug)
788
        tprintf("Final box overlaps nearest\n");
789
      return;
790
    }
791
  }
792
}
793
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
794
// For each blob in this grid, Finds the textline direction to be horizontal
795
// or vertical according to distance to neighbours and 1st and 2nd order
796
// neighbours. Non-text tends to end up without a definite direction.
797
// Result is setting of the neighbours and vert_possible/horz_possible
798
// flags in the BLOBNBOXes currently in this grid.
799
// This function is called more than once if page orientation is uncertain,
800
// so display_if_debugging is true on the final call to display the results.
801
void StrokeWidth::FindTextlineFlowDirection(bool display_if_debugging) {
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
802
  BlobGridSearch gsearch(this);
803
  BLOBNBOX* bbox;
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
804
  // For every bbox in the grid, set its neighbours.
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
805
  gsearch.StartFullSearch();
806
  while ((bbox = gsearch.NextFullSearch()) != NULL) {
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
807
    SetNeighbours(false, display_if_debugging, bbox);
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
808
  }
809
  // Where vertical or horizontal wins by a big margin, clarify it.
810
  gsearch.StartFullSearch();
811
  while ((bbox = gsearch.NextFullSearch()) != NULL) {
812
    SimplifyObviousNeighbours(bbox);
813
  }
814
  // Now try to make the blobs only vertical or horizontal using neighbours.
815
  gsearch.StartFullSearch();
816
  while ((bbox = gsearch.NextFullSearch()) != NULL) {
817
    SetNeighbourFlows(bbox);
818
  }
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
819
  if ((textord_tabfind_show_strokewidths  && display_if_debugging) ||
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
820
      textord_tabfind_show_strokewidths > 1) {
821
    initial_widths_win_ = DisplayGoodBlobs("InitialStrokewidths", 400, 0);
822
  }
823
  // Improve flow direction with neighbours.
824
  gsearch.StartFullSearch();
825
  while ((bbox = gsearch.NextFullSearch()) != NULL) {
826
    SmoothNeighbourTypes(bbox, false);
827
  }
828
  // Now allow reset of firm values to fix renegades.
829
  gsearch.StartFullSearch();
830
  while ((bbox = gsearch.NextFullSearch()) != NULL) {
831
    SmoothNeighbourTypes(bbox, true);
832
  }
833
  // Repeat.
834
  gsearch.StartFullSearch();
835
  while ((bbox = gsearch.NextFullSearch()) != NULL) {
836
    SmoothNeighbourTypes(bbox, true);
837
  }
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
838
  if ((textord_tabfind_show_strokewidths  && display_if_debugging) ||
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
839
      textord_tabfind_show_strokewidths > 1) {
840
    widths_win_ = DisplayGoodBlobs("ImprovedStrokewidths", 800, 0);
841
  }
842
}
843
844
// Sets the neighbours and good_stroke_neighbours members of the blob by
845
// searching close on all 4 sides.
846
// When finding leader dots/dashes, there is a slightly different rule for
847
// what makes a good neighbour.
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
848
void StrokeWidth::SetNeighbours(bool leaders, bool activate_line_trap,
849
                                BLOBNBOX* blob) {
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
850
  int line_trap_count = 0;
851
  for (int dir = 0; dir < BND_COUNT; ++dir) {
852
    BlobNeighbourDir bnd = static_cast<BlobNeighbourDir>(dir);
853
    line_trap_count += FindGoodNeighbour(bnd, leaders, blob);
854
  }
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
855
  if (line_trap_count > 0 && activate_line_trap) {
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
856
    // It looks like a line so isolate it by clearing its neighbours.
857
    blob->ClearNeighbours();
858
    const TBOX& box = blob->bounding_box();
859
    blob->set_region_type(box.width() > box.height() ? BRT_HLINE : BRT_VLINE);
860
  }
861
}
862
863
864
// Sets the good_stroke_neighbours member of the blob if it has a
865
// GoodNeighbour on the given side.
866
// Also sets the neighbour in the blob, whether or not a good one is found.
867
// Returns the number of blobs in the nearby search area that would lead us to
868
// believe that this blob is a line separator.
869
// Leaders get extra special lenient treatment.
870
int StrokeWidth::FindGoodNeighbour(BlobNeighbourDir dir, bool leaders,
871
                                   BLOBNBOX* blob) {
872
  // Search for neighbours that overlap vertically.
873
  TBOX blob_box = blob->bounding_box();
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
874
  bool debug = AlignedBlob::WithinTestRegion(2, blob_box.left(),
875
                                             blob_box.bottom());
876
  if (debug) {
877
    tprintf("FGN in dir %d for blob:", dir);
878
    blob_box.print();
879
  }
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
880
  int top = blob_box.top();
881
  int bottom = blob_box.bottom();
882
  int left = blob_box.left();
883
  int right = blob_box.right();
884
  int width = right - left;
885
  int height = top - bottom;
886
887
  // A trap to detect lines tests for the min dimension of neighbours
888
  // being larger than a multiple of the min dimension of the line
889
  // and the larger dimension being smaller than a fraction of the max
890
  // dimension of the line.
891
  int line_trap_max = MAX(width, height) / kLineTrapLongest;
892
  int line_trap_min = MIN(width, height) * kLineTrapShortest;
893
  int line_trap_count = 0;
894
895
  int min_good_overlap = (dir == BND_LEFT || dir == BND_RIGHT)
896
                       ? height / 2 : width / 2;
897
  int min_decent_overlap = (dir == BND_LEFT || dir == BND_RIGHT)
898
                       ? height / 3 : width / 3;
899
  if (leaders)
900
    min_good_overlap = min_decent_overlap = 1;
901
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
902
  int search_pad = static_cast<int>(
903
      sqrt(static_cast<double>(width * height)) * kNeighbourSearchFactor);
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
904
  if (gridsize() > search_pad)
905
    search_pad = gridsize();
906
  TBOX search_box = blob_box;
907
  // Pad the search in the appropriate direction.
908
  switch (dir) {
909
  case BND_LEFT:
910
    search_box.set_left(search_box.left() - search_pad);
911
    break;
912
  case BND_RIGHT:
913
    search_box.set_right(search_box.right() + search_pad);
914
    break;
915
  case BND_BELOW:
916
    search_box.set_bottom(search_box.bottom() - search_pad);
917
    break;
918
  case BND_ABOVE:
919
    search_box.set_top(search_box.top() + search_pad);
920
    break;
921
  case BND_COUNT:
922
    return 0;
923
  }
924
925
  BlobGridSearch rectsearch(this);
926
  rectsearch.StartRectSearch(search_box);
927
  BLOBNBOX* best_neighbour = NULL;
928
  double best_goodness = 0.0;
929
  bool best_is_good = false;
930
  BLOBNBOX* neighbour;
931
  while ((neighbour = rectsearch.NextRectSearch()) != NULL) {
932
    TBOX nbox = neighbour->bounding_box();
933
    if (neighbour == blob)
934
      continue;
935
    int mid_x = (nbox.left() + nbox.right()) / 2;
936
    if (mid_x < blob->left_rule() || mid_x > blob->right_rule())
937
      continue;  // In a different column.
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
938
    if (debug) {
939
      tprintf("Neighbour at:");
940
      nbox.print();
941
    }
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
942
943
    // Last-minute line detector. There is a small upper limit to the line
944
    // width accepted by the morphological line detector.
945
    int n_width = nbox.width();
946
    int n_height = nbox.height();
947
    if (MIN(n_width, n_height) > line_trap_min &&
948
        MAX(n_width, n_height) < line_trap_max)
949
      ++line_trap_count;
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
950
    // Heavily joined text, such as Arabic may have very different sizes when
951
    // looking at the maxes, but the heights may be almost identical, so check
952
    // for a difference in height if looking sideways or width vertically.
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
953
    if (TabFind::VeryDifferentSizes(MAX(n_width, n_height),
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
954
                                    MAX(width, height)) &&
955
        (((dir == BND_LEFT || dir ==BND_RIGHT) &&
956
            TabFind::DifferentSizes(n_height, height)) ||
957
         ((dir == BND_BELOW || dir ==BND_ABOVE) &&
958
             TabFind::DifferentSizes(n_width, width)))) {
959
      if (debug) tprintf("Bad size\n");
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
960
      continue;  // Could be a different font size or non-text.
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
961
    }
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
962
    // Amount of vertical overlap between the blobs.
963
    int overlap;
964
    // If the overlap is along the short side of the neighbour, and it
965
    // is fully overlapped, then perp_overlap holds the length of the long
966
    // side of the neighbour. A measure to include hyphens and dashes as
967
    // legitimate neighbours.
968
    int perp_overlap;
969
    int gap;
970
    if (dir == BND_LEFT || dir == BND_RIGHT) {
971
      overlap = MIN(nbox.top(), top) - MAX(nbox.bottom(), bottom);
972
      if (overlap == nbox.height() && nbox.width() > nbox.height())
973
        perp_overlap = nbox.width();
974
      else
975
        perp_overlap = overlap;
976
      gap = dir == BND_LEFT ? left - nbox.left() : nbox.right() - right;
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
977
      if (gap <= 0) {
978
        if (debug) tprintf("On wrong side\n");
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
979
        continue;  // On the wrong side.
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
980
      }
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
981
      gap -= n_width;
982
    } else {
983
      overlap = MIN(nbox.right(), right) - MAX(nbox.left(), left);
984
      if (overlap == nbox.width() && nbox.height() > nbox.width())
985
        perp_overlap = nbox.height();
986
      else
987
        perp_overlap = overlap;
988
      gap = dir == BND_BELOW ? bottom - nbox.bottom() : nbox.top() - top;
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
989
      if (gap <= 0) {
990
        if (debug) tprintf("On wrong side\n");
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
991
        continue;  // On the wrong side.
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
992
      }
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
993
      gap -= n_height;
994
    }
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
995
    if (-gap > overlap) {
996
      if (debug) tprintf("Overlaps wrong way\n");
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
997
      continue;  // Overlaps the wrong way.
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
998
    }
999
    if (perp_overlap < min_decent_overlap) {
1000
      if (debug) tprintf("Doesn't overlap enough\n");
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
1001
      continue;  // Doesn't overlap enough.
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
1002
    }
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
1003
    bool bad_sizes = TabFind::DifferentSizes(height, n_height) &&
1004
                     TabFind::DifferentSizes(width, n_width);
1005
    bool is_good = overlap >= min_good_overlap && !bad_sizes &&
1006
                   blob->MatchingStrokeWidth(*neighbour,
1007
                                             kStrokeWidthFractionTolerance,
1008
                                             kStrokeWidthTolerance);
1009
    // Best is a fuzzy combination of gap, overlap and is good.
1010
    // Basically if you make one thing twice as good without making
1011
    // anything else twice as bad, then it is better.
1012
    if (gap < 1) gap = 1;
1013
    double goodness = (1.0 + is_good) * overlap / gap;
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
1014
    if (debug) {
1015
      tprintf("goodness = %g vs best of %g, good=%d, overlap=%d, gap=%d\n",
1016
              goodness, best_goodness, is_good, overlap, gap);
1017
    }
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
1018
    if (goodness > best_goodness) {
1019
      best_neighbour = neighbour;
1020
      best_goodness = goodness;
1021
      best_is_good = is_good;
1022
    }
1023
  }
1024
  blob->set_neighbour(dir, best_neighbour, best_is_good);
1025
  return line_trap_count;
1026
}
1027
1028
// Helper to get a list of 1st-order neighbours.
1029
static void ListNeighbours(const BLOBNBOX* blob,
1030
                           BLOBNBOX_CLIST* neighbours) {
1031
  for (int dir = 0; dir < BND_COUNT; ++dir) {
1032
    BlobNeighbourDir bnd = static_cast<BlobNeighbourDir>(dir);
1033
    BLOBNBOX* neighbour = blob->neighbour(bnd);
1034
    if (neighbour != NULL) {
1035
      neighbours->add_sorted(SortByBoxLeft<BLOBNBOX>, true, neighbour);
1036
    }
1037
  }
1038
}
1039
1040
// Helper to get a list of 1st and 2nd order neighbours.
1041
static void List2ndNeighbours(const BLOBNBOX* blob,
1042
                              BLOBNBOX_CLIST* neighbours) {
1043
  ListNeighbours(blob, neighbours);
1044
  for (int dir = 0; dir < BND_COUNT; ++dir) {
1045
    BlobNeighbourDir bnd = static_cast<BlobNeighbourDir>(dir);
1046
    BLOBNBOX* neighbour = blob->neighbour(bnd);
1047
    if (neighbour != NULL) {
1048
      ListNeighbours(neighbour, neighbours);
1049
    }
1050
  }
1051
}
1052
1053
// Helper to get a list of 1st, 2nd and 3rd order neighbours.
1054
static void List3rdNeighbours(const BLOBNBOX* blob,
1055
                              BLOBNBOX_CLIST* neighbours) {
1056
  List2ndNeighbours(blob, neighbours);
1057
  for (int dir = 0; dir < BND_COUNT; ++dir) {
1058
    BlobNeighbourDir bnd = static_cast<BlobNeighbourDir>(dir);
1059
    BLOBNBOX* neighbour = blob->neighbour(bnd);
1060
    if (neighbour != NULL) {
1061
      List2ndNeighbours(neighbour, neighbours);
1062
    }
1063
  }
1064
}
1065
1066
// Helper to count the evidence for verticalness or horizontalness
1067
// in a list of neighbours.
1068
static void CountNeighbourGaps(bool debug, BLOBNBOX_CLIST* neighbours,
1069
                               int* pure_h_count, int* pure_v_count) {
1070
  if (neighbours->length() <= kMostlyOneDirRatio)
1071
    return;
1072
  BLOBNBOX_C_IT it(neighbours);
1073
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1074
    BLOBNBOX* blob = it.data();
1075
    int h_min, h_max, v_min, v_max;
1076
    blob->MinMaxGapsClipped(&h_min, &h_max, &v_min, &v_max);
1077
    if (debug)
1078
      tprintf("Hgaps [%d,%d], vgaps [%d,%d]:", h_min, h_max, v_min, v_max);
1079
    if (h_max < v_min ||
1080
        blob->leader_on_left() || blob->leader_on_right()) {
1081
      // Horizontal gaps are clear winners. Count a pure horizontal.
1082
      ++*pure_h_count;
1083
      if (debug) tprintf("Horz at:");
1084
    } else if (v_max < h_min) {
1085
      // Vertical gaps are clear winners. Clear a pure vertical.
1086
      ++*pure_v_count;
1087
      if (debug) tprintf("Vert at:");
1088
    } else {
1089
      if (debug) tprintf("Neither at:");
1090
    }
1091
    if (debug)
1092
      blob->bounding_box().print();
1093
  }
1094
}
1095
1096
// Makes the blob to be only horizontal or vertical where evidence
1097
// is clear based on gaps of 2nd order neighbours, or definite individual
1098
// blobs.
1099
void StrokeWidth::SetNeighbourFlows(BLOBNBOX* blob) {
1100
  if (blob->DefiniteIndividualFlow())
1101
    return;
1102
  bool debug = AlignedBlob::WithinTestRegion(2, blob->bounding_box().left(),
1103
                                             blob->bounding_box().bottom());
1104
  if (debug) {
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
1105
    tprintf("SetNeighbourFlows (current flow=%d, type=%d) on:",
1106
            blob->flow(), blob->region_type());
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
1107
    blob->bounding_box().print();
1108
  }
1109
  BLOBNBOX_CLIST neighbours;
1110
  List3rdNeighbours(blob, &neighbours);
1111
  // The number of pure horizontal and vertical neighbours.
1112
  int pure_h_count = 0;
1113
  int pure_v_count = 0;
1114
  CountNeighbourGaps(debug, &neighbours, &pure_h_count, &pure_v_count);
1115
  if (debug) {
1116
    HandleClick(blob->bounding_box().left() + 1,
1117
                blob->bounding_box().bottom() + 1);
1118
    tprintf("SetFlows: h_count=%d, v_count=%d\n",
1119
            pure_h_count, pure_v_count);
1120
  }
1121
  if (!neighbours.empty()) {
1122
    blob->set_vert_possible(true);
1123
    blob->set_horz_possible(true);
1124
    if (pure_h_count > 2 * pure_v_count) {
1125
      // Horizontal gaps are clear winners. Clear vertical neighbours.
1126
      blob->set_vert_possible(false);
1127
    } else if (pure_v_count > 2 * pure_h_count) {
1128
      // Vertical gaps are clear winners. Clear horizontal neighbours.
1129
      blob->set_horz_possible(false);
1130
    }
1131
  } else {
1132
    // Lonely blob. Can't tell its flow direction.
1133
    blob->set_vert_possible(false);
1134
    blob->set_horz_possible(false);
1135
  }
1136
}
1137
1138
1139
// Helper to count the number of horizontal and vertical blobs in a list.
1140
static void CountNeighbourTypes(BLOBNBOX_CLIST* neighbours,
1141
                                int* pure_h_count, int* pure_v_count) {
1142
  BLOBNBOX_C_IT it(neighbours);
1143
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1144
    BLOBNBOX* blob = it.data();
1145
    if (blob->UniquelyHorizontal())
1146
      ++*pure_h_count;
1147
    if (blob->UniquelyVertical())
1148
      ++*pure_v_count;
1149
  }
1150
}
1151
1152
// Nullify the neighbours in the wrong directions where the direction
1153
// is clear-cut based on a distance margin. Good for isolating vertical
1154
// text from neighbouring horizontal text.
1155
void StrokeWidth::SimplifyObviousNeighbours(BLOBNBOX* blob) {
391 by theraysmith
Removal of NEWDELETE + fix of problem with joined text
1156
  // Case 1: We have text that is likely several characters, blurry and joined
1157
  //         together.
1158
  if ((blob->bounding_box().width() > 3 * blob->area_stroke_width() &&
1159
       blob->bounding_box().height() > 3 * blob->area_stroke_width())) {
1160
    // The blob is complex (not stick-like).
1161
    if (blob->bounding_box().width() > 4 * blob->bounding_box().height()) {
1162
      // Horizontal conjoined text.
1163
      blob->set_neighbour(BND_ABOVE, NULL, false);
1164
      blob->set_neighbour(BND_BELOW, NULL, false);
1165
      return;
1166
    }
1167
    if (blob->bounding_box().height() > 4 * blob->bounding_box().width()) {
1168
      // Vertical conjoined text.
1169
      blob->set_neighbour(BND_LEFT, NULL, false);
1170
      blob->set_neighbour(BND_RIGHT, NULL, false);
1171
      return;
1172
    }
1173
  }
1174
1175
  // Case 2: This blob is likely a single character.
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
1176
  int margin = gridsize() / 2;
1177
  int h_min, h_max, v_min, v_max;
1178
  blob->MinMaxGapsClipped(&h_min, &h_max, &v_min, &v_max);
1179
  if ((h_max + margin < v_min && h_max < margin / 2) ||
1180
      blob->leader_on_left() || blob->leader_on_right()) {
1181
    // Horizontal gaps are clear winners. Clear vertical neighbours.
1182
    blob->set_neighbour(BND_ABOVE, NULL, false);
1183
    blob->set_neighbour(BND_BELOW, NULL, false);
1184
  } else if (v_max + margin < h_min && v_max < margin / 2) {
1185
    // Vertical gaps are clear winners. Clear horizontal neighbours.
1186
    blob->set_neighbour(BND_LEFT, NULL, false);
1187
    blob->set_neighbour(BND_RIGHT, NULL, false);
1188
  }
1189
}
1190
1191
// Smoothes the vertical/horizontal type of the blob based on the
1192
// 2nd-order neighbours. If reset_all is true, then all blobs are
1193
// changed. Otherwise, only ambiguous blobs are processed.
1194
void StrokeWidth::SmoothNeighbourTypes(BLOBNBOX* blob, bool reset_all) {
1195
  if ((blob->vert_possible() && blob->horz_possible()) || reset_all) {
1196
    // There are both horizontal and vertical so try to fix it.
1197
    BLOBNBOX_CLIST neighbours;
1198
    List2ndNeighbours(blob, &neighbours);
1199
    // The number of pure horizontal and vertical neighbours.
1200
    int pure_h_count = 0;
1201
    int pure_v_count = 0;
1202
    CountNeighbourTypes(&neighbours, &pure_h_count, &pure_v_count);
1203
    if (AlignedBlob::WithinTestRegion(2, blob->bounding_box().left(),
1204
                                      blob->bounding_box().bottom())) {
1205
      HandleClick(blob->bounding_box().left() + 1,
1206
                  blob->bounding_box().bottom() + 1);
1207
      tprintf("pure_h=%d, pure_v=%d\n",
1208
              pure_h_count, pure_v_count);
1209
    }
1210
    if (pure_h_count > pure_v_count) {
1211
      // Horizontal gaps are clear winners. Clear vertical neighbours.
1212
      blob->set_vert_possible(false);
1213
      blob->set_horz_possible(true);
1214
    } else if (pure_v_count > pure_h_count) {
1215
      // Vertical gaps are clear winners. Clear horizontal neighbours.
1216
      blob->set_horz_possible(false);
1217
      blob->set_vert_possible(true);
1218
    }
1219
  } else if (AlignedBlob::WithinTestRegion(2, blob->bounding_box().left(),
1220
                                    blob->bounding_box().bottom())) {
1221
    HandleClick(blob->bounding_box().left() + 1,
1222
                blob->bounding_box().bottom() + 1);
1223
    tprintf("Clean on pass 3!\n");
1224
  }
1225
}
1226
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
1227
// Partition creation. Accumulates vertical and horizontal text chains,
1228
// puts the remaining blobs in as unknowns, and then merges/splits to
1229
// minimize overlap and smoothes the types with neighbours and the color
1230
// image if provided. rerotation is used to rotate the coordinate space
1231
// back to the nontext_map_ image.
1232
void StrokeWidth::FindInitialPartitions(const FCOORD& rerotation,
1233
                                        TO_BLOCK* block,
1234
                                        ColPartitionGrid* part_grid,
1235
                                        ColPartition_LIST* big_parts) {
1236
  FindVerticalTextChains(part_grid);
1237
  FindHorizontalTextChains(part_grid);
1238
  if (textord_tabfind_show_strokewidths) {
1239
    chains_win_ = MakeWindow(0, 400, "Initial text chains");
1240
    part_grid->DisplayBoxes(chains_win_);
1241
    projection_->DisplayProjection();
1242
  }
1243
  part_grid->SplitOverlappingPartitions(big_parts);
1244
  EasyMerges(part_grid);
1245
  RemoveLargeUnusedBlobs(block, part_grid, big_parts);
1246
  TBOX grid_box(bleft(), tright());
1247
  while (part_grid->GridSmoothNeighbours(BTFT_CHAIN, nontext_map_, grid_box,
1248
                                         rerotation));
1249
  while (part_grid->GridSmoothNeighbours(BTFT_NEIGHBOURS, nontext_map_,
1250
                                         grid_box, rerotation));
1251
  TestDiacritics(part_grid, block);
1252
  MergeDiacritics(block, part_grid);
1253
  if (textord_tabfind_show_strokewidths) {
1254
    textlines_win_ = MakeWindow(400, 400, "GoodTextline blobs");
1255
    part_grid->DisplayBoxes(textlines_win_);
1256
    diacritics_win_ = DisplayDiacritics("Diacritics", 0, 0, block);
1257
  }
1258
  PartitionRemainingBlobs(part_grid);
1259
  part_grid->SplitOverlappingPartitions(big_parts);
1260
  EasyMerges(part_grid);
1261
  while (part_grid->GridSmoothNeighbours(BTFT_CHAIN, nontext_map_, grid_box,
1262
                                         rerotation));
1263
  while (part_grid->GridSmoothNeighbours(BTFT_NEIGHBOURS, nontext_map_,
1264
                                         grid_box, rerotation));
1265
  // Now eliminate strong stuff in a sea of the opposite.
1266
  while (part_grid->GridSmoothNeighbours(BTFT_STRONG_CHAIN, nontext_map_,
1267
                                         grid_box, rerotation));
1268
  if (textord_tabfind_show_strokewidths) {
1269
    smoothed_win_ = MakeWindow(800, 400, "Smoothed blobs");
1270
    part_grid->DisplayBoxes(smoothed_win_);
1271
  }
1272
}
1273
1274
// Helper verifies that blob's neighbour in direction dir is good to add to a
1275
// vertical text chain by returning the neighbour if it is not null, not owned,
1276
// and not uniquely horizontal, as well as its neighbour in the opposite
1277
// direction is blob.
1278
static BLOBNBOX* MutualUnusedVNeighbour(const BLOBNBOX* blob,
1279
                                        BlobNeighbourDir dir) {
1280
  BLOBNBOX* next_blob = blob->neighbour(dir);
1281
  if (next_blob == NULL || next_blob->owner() != NULL ||
1282
      next_blob->UniquelyHorizontal())
1283
    return NULL;
1284
  if (next_blob->neighbour(DirOtherWay(dir)) == blob)
1285
    return next_blob;
1286
  return NULL;
1287
}
1288
1289
// Finds vertical chains of text-like blobs and puts them in ColPartitions.
1290
void StrokeWidth::FindVerticalTextChains(ColPartitionGrid* part_grid) {
1291
  BlobGridSearch gsearch(this);
1292
  BLOBNBOX* bbox;
1293
  gsearch.StartFullSearch();
1294
  while ((bbox = gsearch.NextFullSearch()) != NULL) {
1295
    // Only process boxes that have no horizontal hope and have not yet
1296
    // been included in a chain.
1297
    BLOBNBOX* blob;
1298
    if (bbox->owner() == NULL && bbox->UniquelyVertical() &&
1299
        (blob = MutualUnusedVNeighbour(bbox, BND_ABOVE)) != NULL) {
1300
      // Put all the linked blobs into a ColPartition.
1301
      ColPartition* part = new ColPartition(BRT_VERT_TEXT, ICOORD(0, 1));
1302
      part->AddBox(bbox);
1303
      while (blob != NULL) {
1304
        part->AddBox(blob);
1305
        blob = MutualUnusedVNeighbour(blob, BND_ABOVE);
1306
      }
1307
      blob = MutualUnusedVNeighbour(bbox, BND_BELOW);
1308
      while (blob != NULL) {
1309
        part->AddBox(blob);
1310
        blob = MutualUnusedVNeighbour(blob, BND_BELOW);
1311
      }
1312
      CompletePartition(part, part_grid);
1313
    }
1314
  }
1315
}
1316
1317
// Helper verifies that blob's neighbour in direction dir is good to add to a
1318
// horizontal text chain by returning the neighbour if it is not null, not
1319
// owned, and not uniquely vertical, as well as its neighbour in the opposite
1320
// direction is blob.
1321
static BLOBNBOX* MutualUnusedHNeighbour(const BLOBNBOX* blob,
1322
                                        BlobNeighbourDir dir) {
1323
  BLOBNBOX* next_blob = blob->neighbour(dir);
1324
  if (next_blob == NULL || next_blob->owner() != NULL ||
1325
      next_blob->UniquelyVertical())
1326
    return NULL;
1327
  if (next_blob->neighbour(DirOtherWay(dir)) == blob)
1328
    return next_blob;
1329
  return NULL;
1330
}
1331
1332
// Finds horizontal chains of text-like blobs and puts them in ColPartitions.
1333
void StrokeWidth::FindHorizontalTextChains(ColPartitionGrid* part_grid) {
1334
  BlobGridSearch gsearch(this);
1335
  BLOBNBOX* bbox;
1336
  gsearch.StartFullSearch();
1337
  while ((bbox = gsearch.NextFullSearch()) != NULL) {
1338
    BLOBNBOX* blob;
1339
    if (bbox->owner() == NULL && bbox->UniquelyHorizontal() &&
1340
        (blob = MutualUnusedHNeighbour(bbox, BND_RIGHT)) != NULL) {
1341
      // Put all the linked blobs into a ColPartition.
1342
      ColPartition* part = new ColPartition(BRT_TEXT, ICOORD(0, 1));
1343
      part->AddBox(bbox);
1344
      while (blob != NULL) {
1345
        part->AddBox(blob);
1346
        blob = MutualUnusedHNeighbour(blob, BND_RIGHT);
1347
      }
1348
      blob = MutualUnusedHNeighbour(bbox, BND_LEFT);
1349
      while (blob != NULL) {
1350
        part->AddBox(blob);
1351
        blob = MutualUnusedVNeighbour(blob, BND_LEFT);
1352
      }
1353
      CompletePartition(part, part_grid);
1354
    }
1355
  }
1356
}
1357
1358
// Finds diacritics and saves their base character in the blob.
1359
// The objective is to move all diacritics to the noise_blobs list, so
1360
// they don't mess up early textline finding/merging, or force splits
1361
// on textlines that overlap a bit. Blobs that become diacritics must be
1362
// either part of no ColPartition (NULL owner) or in a small partition in
1363
// which ALL the blobs are diacritics, in which case the partition is
1364
// exploded (deleted) back to its blobs.
1365
void StrokeWidth::TestDiacritics(ColPartitionGrid* part_grid, TO_BLOCK* block) {
1366
  BlobGrid small_grid(gridsize(), bleft(), tright());
1367
  small_grid.InsertBlobList(&block->noise_blobs);
1368
  small_grid.InsertBlobList(&block->blobs);
1369
  int medium_diacritics = 0;
1370
  int small_diacritics = 0;
1371
  BLOBNBOX_IT small_it(&block->noise_blobs);
1372
  for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) {
1373
    BLOBNBOX* blob = small_it.data();
1374
    if (blob->owner() == NULL && !blob->IsDiacritic() &&
1375
        DiacriticBlob(&small_grid, blob)) {
1376
      ++small_diacritics;
1377
    }
1378
  }
1379
  BLOBNBOX_IT blob_it(&block->blobs);
1380
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1381
    BLOBNBOX* blob = blob_it.data();
1382
    if (blob->IsDiacritic()) {
1383
      small_it.add_to_end(blob_it.extract());
1384
      continue;  // Already a diacritic.
1385
    }
1386
    ColPartition* part = blob->owner();
1387
    if (part == NULL && DiacriticBlob(&small_grid, blob)) {
1388
      ++medium_diacritics;
1389
      RemoveBBox(blob);
1390
      small_it.add_to_end(blob_it.extract());
1391
    } else if (part != NULL && !part->block_owned() &&
1392
        part->boxes_count() < 3) {
1393
      // We allow blobs in small partitions to become diacritics if ALL the
1394
      // blobs in the partition qualify as we can then cleanly delete the
1395
      // partition, turn all the blobs in it to diacritics and they can be
1396
      // merged into the base character partition more easily than merging
1397
      // the partitions.
1398
      BLOBNBOX_C_IT box_it(part->boxes());
1399
      for (box_it.mark_cycle_pt(); !box_it.cycled_list() &&
1400
           DiacriticBlob(&small_grid, box_it.data());
1401
           box_it.forward());
1402
      if (box_it.cycled_list()) {
1403
        // They are all good.
1404
        while (!box_it.empty()) {
1405
          // Liberate the blob from its partition so it can be treated
1406
          // as a diacritic and merged explicitly with the base part.
1407
          // The blob is really owned by the block. The partition "owner"
1408
          // is NULLed to allow the blob to get merged with its base character
1409
          // partition.
1410
          BLOBNBOX* box = box_it.extract();
1411
          box->set_owner(NULL);
1412
          box_it.forward();
1413
          ++medium_diacritics;
1414
          // We remove the blob from the grid so it isn't found by subsequent
1415
          // searches where we might not want to include diacritics.
1416
          RemoveBBox(box);
1417
        }
1418
        // We only move the one blob to the small list here, but the others
1419
        // all get moved by the test at the top of the loop.
1420
        small_it.add_to_end(blob_it.extract());
1421
        part_grid->RemoveBBox(part);
1422
        delete part;
1423
      }
1424
    } else if (AlignedBlob::WithinTestRegion(2, blob->bounding_box().left(),
1425
                                             blob->bounding_box().bottom())) {
1426
      tprintf("Blob not available to be a diacritic at:");
1427
      blob->bounding_box().print();
1428
    }
1429
  }
1430
  if (textord_tabfind_show_strokewidths) {
1431
    tprintf("Found %d small diacritics, %d medium\n",
1432
            small_diacritics, medium_diacritics);
1433
  }
1434
}
1435
1436
// Searches this grid for an appropriately close and sized neighbour of the
1437
// given [small] blob. If such a blob is found, the diacritic base is saved
1438
// in the blob and true is returned.
1439
// The small_grid is a secondary grid that contains the small/noise objects
1440
// that are not in this grid, but may be useful for determining a connection
1441
// between blob and its potential base character. (See DiacriticXGapFilled.)
1442
bool StrokeWidth::DiacriticBlob(BlobGrid* small_grid, BLOBNBOX* blob) {
1443
  if (BLOBNBOX::UnMergeableType(blob->region_type()) ||
1444
      blob->region_type() == BRT_VERT_TEXT)
1445
    return false;
1446
  TBOX small_box(blob->bounding_box());
1447
  bool debug = AlignedBlob::WithinTestRegion(2, small_box.left(),
1448
                                             small_box.bottom());
1449
  if (debug) {
1450
    tprintf("Testing blob for diacriticness at:");
1451
    small_box.print();
1452
  }
1453
  int x = (small_box.left() + small_box.right()) / 2;
1454
  int y = (small_box.bottom() + small_box.top()) / 2;
1455
  int grid_x, grid_y;
1456
  GridCoords(x, y, &grid_x, &grid_y);
1457
  int height = small_box.height();
1458
  // Setup a rectangle search to find its nearest base-character neighbour.
1459
  // We keep 2 different best candidates:
1460
  // best_x_overlap is a category of base characters that have an overlap in x
1461
  // (like a acute) in which we look for the least y-gap, computed using the
1462
  // projection to favor base characters in the same textline.
1463
  // best_y_overlap is a category of base characters that have no x overlap,
1464
  // (nominally a y-overlap is preferrecd but not essential) in which we
1465
  // look for the least weighted sum of x-gap and y-gap, with x-gap getting
1466
  // a lower weight to catch quotes at the end of a textline.
1467
  // NOTE that x-gap and y-gap are measured from the nearest side of the base
1468
  // character to the FARTHEST side of the diacritic to allow small diacritics
1469
  // to be a reasonable distance away, but not big diacritics.
1470
  BLOBNBOX* best_x_overlap = NULL;
1471
  BLOBNBOX* best_y_overlap = NULL;
1472
  int best_total_dist = 0;
1473
  int best_y_gap = 0;
1474
  TBOX best_xbox;
1475
  // TODO(rays) the search box could be setup using the projection as a guide.
1476
  TBOX search_box(small_box);
1477
  int x_pad = IntCastRounded(gridsize() * kDiacriticXPadRatio);
1478
  int y_pad = IntCastRounded(gridsize() * kDiacriticYPadRatio);
1479
  search_box.pad(x_pad, y_pad);
1480
  BlobGridSearch rsearch(this);
1481
  rsearch.SetUniqueMode(true);
1482
  int min_height = height * kMinDiacriticSizeRatio;
1483
  rsearch.StartRectSearch(search_box);
1484
  BLOBNBOX* neighbour;
1485
  while ((neighbour = rsearch.NextRectSearch()) != NULL) {
1486
    if (BLOBNBOX::UnMergeableType(neighbour->region_type()) ||
1487
        neighbour == blob || neighbour->owner() == blob->owner())
1488
      continue;
1489
    TBOX nbox = neighbour->bounding_box();
1490
    if (neighbour->owner() == NULL || neighbour->owner()->IsVerticalType() ||
1491
        (neighbour->flow() != BTFT_CHAIN &&
1492
            neighbour->flow() != BTFT_STRONG_CHAIN)) {
1493
      if (debug) {
1494
        tprintf("Neighbour not strong enough:");
1495
        nbox.print();
1496
      }
1497
      continue;  // Diacritics must be attached to strong text.
1498
    }
1499
    if (nbox.height() < min_height) {
1500
      if (debug) {
1501
        tprintf("Neighbour not big enough:");
1502
        nbox.print();
1503
      }
1504
      continue;  // Too small to be the base character.
1505
    }
1506
    int x_gap = small_box.x_gap(nbox);
1507
    int y_gap = small_box.y_gap(nbox);
1508
    int total_distance = projection_->DistanceOfBoxFromBox(small_box, nbox,
1509
                                                           true, denorm_,
1510
                                                           debug);
1511
    if (debug) tprintf("xgap=%d, y=%d, total dist=%d\n",
1512
                       x_gap, y_gap, total_distance);
1513
    if (total_distance >
1514
        neighbour->owner()->median_size() * kMaxDiacriticDistanceRatio) {
1515
      if (debug) {
1516
        tprintf("Neighbour with median size %d too far away:",
1517
                neighbour->owner()->median_size());
1518
        neighbour->bounding_box().print();
1519
      }
1520
      continue;  // Diacritics must not be too distant.
1521
    }
1522
    if (x_gap <= 0) {
1523
      if (debug) {
1524
        tprintf("Computing reduced box for :");
1525
        nbox.print();
1526
      }
1527
      int left = small_box.left() - small_box.width();
1528
      int right = small_box.right() + small_box.width();
1529
      nbox = neighbour->BoundsWithinLimits(left, right);
1530
      y_gap = small_box.y_gap(nbox);
1531
      if (best_x_overlap == NULL || y_gap < best_y_gap) {
1532
        best_x_overlap = neighbour;
1533
        best_xbox = nbox;
1534
        best_y_gap = y_gap;
1535
        if (debug) {
1536
          tprintf("New best:");
1537
          nbox.print();
1538
        }
1539
      } else if (debug) {
1540
        tprintf("Shrunken box doesn't win:");
1541
        nbox.print();
1542
      }
1543
    } else if (blob->ConfirmNoTabViolation(*neighbour)) {
1544
      if (best_y_overlap == NULL || total_distance < best_total_dist) {
1545
        if (debug) {
1546
          tprintf("New best y overlap:");
1547
          nbox.print();
1548
        }
1549
        best_y_overlap = neighbour;
1550
        best_total_dist = total_distance;
1551
      } else if (debug) {
1552
        tprintf("New y overlap box doesn't win:");
1553
        nbox.print();
1554
      }
1555
    } else if (debug) {
1556
      tprintf("Neighbour wrong side of a tab:");
1557
      nbox.print();
1558
    }
1559
  }
1560
  if (best_x_overlap != NULL &&
1561
      (best_y_overlap == NULL ||
1562
       best_xbox.major_y_overlap(best_y_overlap->bounding_box()))) {
1563
    blob->set_diacritic_box(best_xbox);
1564
    blob->set_base_char_blob(best_x_overlap);
1565
    if (debug) {
1566
      tprintf("DiacriticBlob OK! (x-overlap:");
1567
      small_box.print();
1568
      best_xbox.print();
1569
    }
1570
    return true;
1571
  }
1572
  if (best_y_overlap != NULL &&
1573
      DiacriticXGapFilled(small_grid, small_box,
1574
                          best_y_overlap->bounding_box()) &&
1575
      NoNoiseInBetween(small_box, best_y_overlap->bounding_box())) {
1576
    blob->set_diacritic_box(best_y_overlap->bounding_box());
1577
    blob->set_base_char_blob(best_y_overlap);
1578
    if (debug) {
1579
      tprintf("DiacriticBlob OK! (y-overlap:");
1580
      small_box.print();
1581
      best_y_overlap->bounding_box().print();
1582
    }
1583
    return true;
1584
  }
1585
  if (debug) {
1586
    tprintf("DiacriticBlob fails:");
1587
    small_box.print();
1588
    tprintf("Best x+y gap = %d, y = %d\n", best_total_dist, best_y_gap);
1589
    if (best_y_overlap != NULL) {
1590
      tprintf("XGapFilled=%d, NoiseBetween=%d\n",
1591
              DiacriticXGapFilled(small_grid, small_box,
1592
                                  best_y_overlap->bounding_box()),
1593
              NoNoiseInBetween(small_box, best_y_overlap->bounding_box()));
1594
    }
1595
  }
1596
  return false;
1597
}
1598
1599
// Returns true if there is no gap between the base char and the diacritic
1600
// bigger than a fraction of the height of the base char:
1601
// Eg: line end.....'
1602
// The quote is a long way from the end of the line, yet it needs to be a
1603
// diacritic. To determine that the quote is not part of an image, or
1604
// a different text block, we check for other marks in the gap between
1605
// the base char and the diacritic.
1606
//                          '<--Diacritic
1607
// |---------|
1608
// |         |<-toobig-gap->
1609
// | Base    |<ok gap>
1610
// |---------|        x<-----Dot occupying gap
1611
// The grid is const really.
1612
bool StrokeWidth::DiacriticXGapFilled(BlobGrid* grid,
1613
                                      const TBOX& diacritic_box,
1614
                                      const TBOX& base_box) {
1615
  // Since most gaps are small, use an iterative algorithm to search the gap.
1616
  int max_gap = IntCastRounded(base_box.height() *
1617
                               kMaxDiacriticGapToBaseCharHeight);
1618
  TBOX occupied_box(base_box);
1619
  int diacritic_gap;
1620
  while ((diacritic_gap = diacritic_box.x_gap(occupied_box)) > max_gap) {
1621
    TBOX search_box(occupied_box);
1622
    if (diacritic_box.left() > search_box.right()) {
1623
      // We are looking right.
1624
      search_box.set_left(search_box.right());
1625
      search_box.set_right(search_box.left() + max_gap);
1626
    } else {
1627
      // We are looking left.
1628
      search_box.set_right(search_box.left());
1629
      search_box.set_left(search_box.left() - max_gap);
1630
    }
1631
    BlobGridSearch rsearch(grid);
1632
    rsearch.StartRectSearch(search_box);
1633
    BLOBNBOX* neighbour;
1634
    while ((neighbour = rsearch.NextRectSearch()) != NULL) {
1635
      const TBOX& nbox = neighbour->bounding_box();
1636
      if (nbox.x_gap(diacritic_box) < diacritic_gap) {
1637
        if (nbox.left() < occupied_box.left())
1638
          occupied_box.set_left(nbox.left());
1639
        if (nbox.right() > occupied_box.right())
1640
          occupied_box.set_right(nbox.right());
1641
        break;
1642
      }
1643
    }
1644
    if (neighbour == NULL)
1645
      return false;  // Found a big gap.
1646
  }
1647
  return true;  // The gap was filled.
1648
}
1649
1650
// Merges diacritics with the ColPartition of the base character blob.
1651
void StrokeWidth::MergeDiacritics(TO_BLOCK* block,
1652
                                  ColPartitionGrid* part_grid) {
1653
  BLOBNBOX_IT small_it(&block->noise_blobs);
1654
  for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) {
1655
    BLOBNBOX* blob = small_it.data();
1656
    if (blob->base_char_blob() != NULL) {
1657
      ColPartition* part = blob->base_char_blob()->owner();
1658
      // The base character must be owned by a partition and that partition
1659
      // must not be on the big_parts list (not block owned).
1660
      if (part != NULL && !part->block_owned() && blob->owner() == NULL &&
1661
          blob->IsDiacritic()) {
1662
        // The partition has to be removed from the grid and reinserted
1663
        // because its bounding box may change.
1664
        part_grid->RemoveBBox(part);
1665
        part->AddBox(blob);
1666
        blob->set_region_type(part->blob_type());
1667
        blob->set_flow(part->flow());
1668
        blob->set_owner(part);
1669
        part_grid->InsertBBox(true, true, part);
1670
      }
1671
      // Set all base chars to NULL before any blobs get deleted.
1672
      blob->set_base_char_blob(NULL);
1673
    }
1674
  }
1675
}
1676
1677
// Any blobs on the large_blobs list of block that are still unowned by a
1678
// ColPartition, are probably drop-cap or vertically touching so the blobs
1679
// are removed to the big_parts list and treated separately.
1680
void StrokeWidth::RemoveLargeUnusedBlobs(TO_BLOCK* block,
1681
                                         ColPartitionGrid* part_grid,
1682
                                         ColPartition_LIST* big_parts) {
1683
  BLOBNBOX_IT large_it(&block->large_blobs);
199 by theraysmith
Changes to textord for 3.00
1684
  for (large_it.mark_cycle_pt(); !large_it.cycled_list(); large_it.forward()) {
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
1685
    BLOBNBOX* blob = large_it.data();
1686
    ColPartition* big_part = blob->owner();
1687
    if (big_part == NULL) {
1688
      // Large blobs should have gone into partitions by now if they are
1689
      // genuine characters, so move any unowned ones out to the big parts
1690
      // list. This will include drop caps and vertically touching characters.
1691
      ColPartition::MakeBigPartition(blob, big_parts);
1692
    }
1693
  }
1694
}
1695
1696
// All remaining unused blobs are put in individual ColPartitions.
1697
void StrokeWidth::PartitionRemainingBlobs(ColPartitionGrid* part_grid) {
1698
  BlobGridSearch gsearch(this);
1699
  BLOBNBOX* bbox;
1700
  int prev_grid_x = -1;
1701
  int prev_grid_y = -1;
1702
  BLOBNBOX_CLIST cell_list;
1703
  BLOBNBOX_C_IT cell_it(&cell_list);
1704
  bool cell_all_noise = true;
1705
  gsearch.StartFullSearch();
1706
  while ((bbox = gsearch.NextFullSearch()) != NULL) {
1707
    int grid_x = gsearch.GridX();
1708
    int grid_y = gsearch.GridY();
1709
    if (grid_x != prev_grid_x || grid_y != prev_grid_y) {
1710
      // New cell. Process old cell.
1711
      MakePartitionsFromCellList(cell_all_noise, part_grid, &cell_list);
1712
      cell_it.set_to_list(&cell_list);
1713
      prev_grid_x = grid_x;
1714
      prev_grid_y = grid_y;
1715
      cell_all_noise = true;
1716
    }
1717
    if (bbox->owner() == NULL) {
1718
      cell_it.add_to_end(bbox);
1719
      if (bbox->flow() != BTFT_NONTEXT)
1720
        cell_all_noise = false;
1721
    } else {
1722
      cell_all_noise = false;
1723
    }
1724
  }
1725
  MakePartitionsFromCellList(cell_all_noise, part_grid, &cell_list);
1726
}
1727
1728
// If combine, put all blobs in the cell_list into a single partition, otherwise
1729
// put each one into its own partition.
1730
void StrokeWidth::MakePartitionsFromCellList(bool combine,
1731
                                             ColPartitionGrid* part_grid,
1732
                                             BLOBNBOX_CLIST* cell_list) {
1733
  if (cell_list->empty())
1734
    return;
1735
  BLOBNBOX_C_IT cell_it(cell_list);
1736
  if (combine) {
1737
    BLOBNBOX* bbox = cell_it.extract();
1738
    ColPartition* part = new ColPartition(bbox->region_type(), ICOORD(0, 1));
1739
    part->AddBox(bbox);
1740
    part->set_flow(bbox->flow());
1741
    for (cell_it.forward(); !cell_it.empty(); cell_it.forward()) {
1742
      part->AddBox(cell_it.extract());
1743
    }
1744
    CompletePartition(part, part_grid);
1745
  } else {
1746
    for (; !cell_it.empty(); cell_it.forward()) {
1747
      BLOBNBOX* bbox = cell_it.extract();
1748
      ColPartition* part = new ColPartition(bbox->region_type(), ICOORD(0, 1));
1749
      part->set_flow(bbox->flow());
1750
      part->AddBox(bbox);
1751
      CompletePartition(part, part_grid);
1752
    }
1753
  }
1754
}
1755
1756
// Helper function to finish setting up a ColPartition and insert into
1757
// part_grid.
1758
void StrokeWidth::CompletePartition(ColPartition* part,
1759
                                    ColPartitionGrid* part_grid) {
1760
  part->ComputeLimits();
1761
  TBOX box = part->bounding_box();
1762
  bool debug = AlignedBlob::WithinTestRegion(2, box.left(),
1763
                                             box.bottom());
1764
  int value = projection_->EvaluateColPartition(*part, denorm_, debug);
1765
  part->SetRegionAndFlowTypesFromProjectionValue(value);
1766
  part->ClaimBoxes();
1767
  part_grid->InsertBBox(true, true, part);
1768
}
1769
1770
// Merge partitions where the merge appears harmless.
1771
// As this
1772
void StrokeWidth::EasyMerges(ColPartitionGrid* part_grid) {
1773
  part_grid->Merges(
1774
      NewPermanentTessCallback(this, &StrokeWidth::OrientationSearchBox),
1775
      NewPermanentTessCallback(this, &StrokeWidth::ConfirmEasyMerge));
1776
}
1777
1778
// Compute a search box based on the orientation of the partition.
1779
// Returns true if a suitable box can be calculated.
1780
// Callback for EasyMerges.
1781
bool StrokeWidth::OrientationSearchBox(ColPartition* part, TBOX* box) {
1782
  if (part->IsVerticalType()) {
1783
    box->set_top(box->top() + box->width());
1784
    box->set_bottom(box->bottom() - box->width());
1785
  } else {
1786
    box->set_left(box->left() - box->height());
1787
    box->set_right(box->right() + box->height());
1788
  }
1789
  return true;
1790
}
1791
1792
// Merge confirmation callback for EasyMerges.
1793
bool StrokeWidth::ConfirmEasyMerge(const ColPartition* p1,
1794
                                   const ColPartition* p2) {
1795
  ASSERT_HOST(p1 != NULL && p2 != NULL);
1796
  ASSERT_HOST(!p1->IsEmpty() && !p2->IsEmpty());
1797
  if ((p1->flow() == BTFT_NONTEXT && p2->flow() >= BTFT_CHAIN) ||
1798
      (p1->flow() >= BTFT_CHAIN && p2->flow() == BTFT_NONTEXT))
1799
    return false;  // Don't merge confirmed image with text.
1800
  if ((p1->IsVerticalType() || p2->IsVerticalType()) &&
1801
       p1->HCoreOverlap(*p2) <= 0 &&
1802
       ((!p1->IsSingleton() &&
1803
         !p2->IsSingleton()) ||
1804
        !p1->bounding_box().major_overlap(p2->bounding_box())))
1805
    return false;  // Overlap must be in the text line.
1806
  if ((p1->IsHorizontalType() || p2->IsHorizontalType()) &&
1807
      p1->VCoreOverlap(*p2) <= 0 &&
1808
      ((!p1->IsSingleton() &&
1809
        !p2->IsSingleton()) ||
1810
       (!p1->bounding_box().major_overlap(p2->bounding_box()) &&
1811
        !p1->OKDiacriticMerge(*p2, false) &&
1812
        !p2->OKDiacriticMerge(*p1, false))))
1813
    return false;  // Overlap must be in the text line.
1814
  if (!p1->ConfirmNoTabViolation(*p2))
1815
    return false;
1816
  if (p1->flow() <= BTFT_NONTEXT && p2->flow() <= BTFT_NONTEXT)
1817
    return true;
1818
  return NoNoiseInBetween(p1->bounding_box(), p2->bounding_box());
1819
}
1820
1821
// Returns true if there is no significant noise in between the boxes.
1822
bool StrokeWidth::NoNoiseInBetween(const TBOX& box1, const TBOX& box2) const {
1823
  return ImageFind::BlankImageInBetween(box1, box2, grid_box_, rerotation_,
1824
                                        nontext_map_);
199 by theraysmith
Changes to textord for 3.00
1825
}
1826
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
1827
/** Displays the blobs colored according to the number of good neighbours
1828
 * and the vertical/horizontal flow.
1829
 */
199 by theraysmith
Changes to textord for 3.00
1830
ScrollView* StrokeWidth::DisplayGoodBlobs(const char* window_name,
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
1831
                                          int x, int y) {
1832
  ScrollView* window = NULL;
199 by theraysmith
Changes to textord for 3.00
1833
#ifndef GRAPHICS_DISABLED
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
1834
  window = MakeWindow(x, y, window_name);
199 by theraysmith
Changes to textord for 3.00
1835
  // For every blob in the grid, display it.
1836
  window->Brush(ScrollView::NONE);
1837
1838
  // For every bbox in the grid, display it.
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
1839
  BlobGridSearch gsearch(this);
199 by theraysmith
Changes to textord for 3.00
1840
  gsearch.StartFullSearch();
1841
  BLOBNBOX* bbox;
1842
  while ((bbox = gsearch.NextFullSearch()) != NULL) {
1843
    TBOX box = bbox->bounding_box();
1844
    int left_x = box.left();
1845
    int right_x = box.right();
1846
    int top_y = box.top();
1847
    int bottom_y = box.bottom();
381 by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process
1848
    int goodness = bbox->GoodTextBlob();
1849
    BlobRegionType blob_type = bbox->region_type();
1850
    if (bbox->UniquelyVertical())
1851
      blob_type = BRT_VERT_TEXT;
1852
    if (bbox->UniquelyHorizontal())
1853
      blob_type = BRT_TEXT;
1854
    BlobTextFlowType flow = bbox->flow();
1855
    if (flow == BTFT_NONE) {
1856
      if (goodness == 0)
1857
        flow = BTFT_NEIGHBOURS;
1858
      else if (goodness == 1)
1859
        flow = BTFT_CHAIN;
1860
      else
1861
        flow = BTFT_STRONG_CHAIN;
1862
    }
1863
    window->Pen(BLOBNBOX::TextlineColor(blob_type, flow));
199 by theraysmith
Changes to textord for 3.00
1864
    window->Rectangle(left_x, bottom_y, right_x, top_y);
1865
  }
1866
  window->Update();
1867
#endif
1868
  return window;
1869
}
1870
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
1871
static void DrawDiacriticJoiner(const BLOBNBOX* blob, ScrollView* window) {
751 by zdenop
fix build with -DGRAPHICS_DISABLED
1872
#ifndef GRAPHICS_DISABLED
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
1873
  const TBOX& blob_box(blob->bounding_box());
1874
  int top = MAX(blob_box.top(), blob->base_char_top());
1875
  int bottom = MIN(blob_box.bottom(), blob->base_char_bottom());
1876
  int x = (blob_box.left() + blob_box.right()) / 2;
1877
  window->Line(x, top, x, bottom);
751 by zdenop
fix build with -DGRAPHICS_DISABLED
1878
#endif  // GRAPHICS_DISABLED
482 by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding
1879
}
1880
1881
// Displays blobs colored according to whether or not they are diacritics.
1882
ScrollView* StrokeWidth::DisplayDiacritics(const char* window_name,
1883
                                           int x, int y, TO_BLOCK* block) {
1884
  ScrollView* window = NULL;
1885
#ifndef GRAPHICS_DISABLED
1886
  window = MakeWindow(x, y, window_name);
1887
  // For every blob in the grid, display it.
1888
  window->Brush(ScrollView::NONE);
1889
1890
  BLOBNBOX_IT it(&block->blobs);
1891
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1892
    BLOBNBOX* blob = it.data();
1893
    if (blob->IsDiacritic()) {
1894
      window->Pen(ScrollView::GREEN);
1895
      DrawDiacriticJoiner(blob, window);
1896
    } else {
1897
      window->Pen(blob->BoxColor());
1898
    }
1899
    const TBOX& box = blob->bounding_box();
1900
    window->Rectangle(box.left(), box. bottom(), box.right(), box.top());
1901
  }
1902
  it.set_to_list(&block->noise_blobs);
1903
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1904
    BLOBNBOX* blob = it.data();
1905
    if (blob->IsDiacritic()) {
1906
      window->Pen(ScrollView::GREEN);
1907
      DrawDiacriticJoiner(blob, window);
1908
    } else {
1909
      window->Pen(ScrollView::WHITE);
1910
    }
1911
    const TBOX& box = blob->bounding_box();
1912
    window->Rectangle(box.left(), box. bottom(), box.right(), box.top());
1913
  }
1914
  window->Update();
1915
#endif
1916
  return window;
199 by theraysmith
Changes to textord for 3.00
1917
}
1918
1919
}  // namespace tesseract.