199
by theraysmith
Changes to textord for 3.00 |
1 |
///////////////////////////////////////////////////////////////////////
|
2 |
// File: strokewidth.cpp
|
|
3 |
// Description: Subclass of BBGrid to find uniformity of strokewidth.
|
|
4 |
// Author: Ray Smith
|
|
5 |
// Created: Mon Mar 31 16:17:01 PST 2008
|
|
6 |
//
|
|
7 |
// (C) Copyright 2008, Google Inc.
|
|
8 |
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
9 |
// you may not use this file except in compliance with the License.
|
|
10 |
// You may obtain a copy of the License at
|
|
11 |
// http://www.apache.org/licenses/LICENSE-2.0
|
|
12 |
// Unless required by applicable law or agreed to in writing, software
|
|
13 |
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
14 |
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
15 |
// See the License for the specific language governing permissions and
|
|
16 |
// limitations under the License.
|
|
17 |
//
|
|
18 |
///////////////////////////////////////////////////////////////////////
|
|
19 |
||
247
by joregan
disable MSVC warning C4244 in a number of places to cut down the noise |
20 |
#ifdef _MSC_VER
|
21 |
#pragma warning(disable:4244) // Conversion warnings |
|
22 |
#endif
|
|
23 |
||
751
by zdenop
fix build with -DGRAPHICS_DISABLED |
24 |
#ifdef HAVE_CONFIG_H
|
25 |
#include "config_auto.h" |
|
26 |
#endif
|
|
27 |
||
199
by theraysmith
Changes to textord for 3.00 |
28 |
#include "strokewidth.h" |
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
29 |
|
30 |
#include <math.h> |
|
31 |
||
199
by theraysmith
Changes to textord for 3.00 |
32 |
#include "blobbox.h" |
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
33 |
#include "colpartition.h" |
34 |
#include "colpartitiongrid.h" |
|
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
35 |
#include "imagefind.h" |
36 |
#include "linlsq.h" |
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
37 |
#include "statistc.h" |
199
by theraysmith
Changes to textord for 3.00 |
38 |
#include "tabfind.h" |
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
39 |
#include "textlineprojection.h" |
199
by theraysmith
Changes to textord for 3.00 |
40 |
#include "tordmain.h" // For SetBlobStrokeWidth. |
41 |
||
42 |
namespace tesseract { |
|
43 |
||
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
44 |
// Runtime-tunable parameters for stroke-width and text-direction analysis.

// Non-zero enables the debug windows created in this file.
INT_VAR(textord_tabfind_show_strokewidths, 0, "Show stroke widths");
// When set, ~StrokeWidth() exits the process once the widths window closes.
BOOL_VAR(textord_tabfind_only_strokewidths, false, "Only run stroke widths");
// When false, TestVerticalTextDirection() always reports horizontal text.
BOOL_VAR(textord_tabfind_vertical_text, true, "Enable vertical detection");
// When set, TestVerticalTextDirection() always reports vertical text.
BOOL_VAR(textord_tabfind_force_vertical_text,
         false,
         "Force using vertical text page mode");
BOOL_VAR(textord_tabfind_vertical_horizontal_mix,
         true,
         "find horizontal lines such as headers in vertical page mode");
// Threshold fraction used by TestVerticalTextDirection().
double_VAR(textord_tabfind_vertical_text_ratio,
           0.5,
           "Fraction of textlines deemed vertical to use vertical page mode");
|
53 |
||
314
by joregan
more doxygen |
54 |
/** Allowed proportional change in stroke width to be the same font. */
const double kStrokeWidthFractionTolerance = 0.125;
/**
 * Allowed constant change in stroke width to be the same font.
 * Really 1.5 pixels.
 */
const double kStrokeWidthTolerance = 1.5;
/** Proportional stroke width tolerance for CJK, which is more generous. */
const double kStrokeWidthFractionCJK = 0.25;
/** Constant stroke width tolerance for CJK, which is more generous. */
const double kStrokeWidthCJK = 2.0;
/**
 * Radius in grid cells of the search for broken CJK. It does not need to be
 * very large as the grid size should be about the size of a character anyway.
 */
const int kCJKRadius = 2;
/** Max distance, as a fraction of size, to join close but broken CJK chars. */
const double kCJKBrokenDistanceFraction = 0.25;
/** Max number of components in a broken CJK character. */
const int kCJKMaxComponents = 8;
/** Max aspect ratio of CJK broken characters when put back together. */
const double kCJKAspectRatio = 1.25;
/** Max increase in aspect ratio of CJK broken characters when merged. */
const double kCJKAspectRatioIncrease = 1.0625;
/** Max multiple of the grid size used in computing the median CJK size. */
const int kMaxCJKSizeRatio = 5;
/** Min fraction of blobs that are broken CJK to iterate and run it again. */
const double kBrokenCJKIterationFraction = 0.125;
/**
 * Multiple of gridsize as x-padding for a search box for diacritic base
 * characters.
 */
const double kDiacriticXPadRatio = 7.0;
/**
 * Multiple of gridsize as y-padding for a search box for diacritic base
 * characters.
 */
const double kDiacriticYPadRatio = 1.75;
/**
 * Min multiple of diacritic height that a neighbour must be to be a
 * convincing base character.
 */
const double kMinDiacriticSizeRatio = 1.0625;
/**
 * Max multiple of a textline's median height as a threshold for the sum of
 * a diacritic's farthest x and y distances (gap + size).
 */
const double kMaxDiacriticDistanceRatio = 1.25;
/**
 * Max x-gap between a diacritic and its base char as a fraction of the height
 * of the base char (allowing other blobs to fill the gap).
 */
const double kMaxDiacriticGapToBaseCharHeight = 1.0;
/** Radius of a search for diacritics in grid units. */
const int kSearchRadius = 2;
/**
 * Ratio between longest side of a line and longest side of a character.
 *   (neighbor_min > blob_min * kLineTrapShortest &&
 *    neighbor_max < blob_max / kLineTrapLongest)
 *   => neighbor is a grapheme and blob is a line.
 */
const int kLineTrapLongest = 4;
/** Ratio between shortest side of a line and shortest side of a character. */
const int kLineTrapShortest = 2;
/**
 * Max aspect ratio of the total box before CountNeighbourGaps
 * decides immediately based on the aspect ratio.
 */
const int kMostlyOneDirRatio = 3;
/** Aspect ratio for a blob to be considered as line residue. */
const double kLineResidueAspectRatio = 8.0;
/** Padding ratio for the line residue search box. */
const int kLineResiduePadRatio = 3;
/** Min multiple of neighbour size for a line residue to be genuine. */
const double kLineResidueSizeRatio = 1.75;
/**
 * Aspect ratio filter for OSD: blobs whose long/short side ratio exceeds this
 * are not collected for orientation and script detection.
 */
const float kSizeRatioToReject = 2.0;
/**
 * Max number of normal blobs a large blob may overlap before it is rejected
 * and determined to be image.
 */
const int kMaxLargeOverlaps = 3;
/** Expansion factor for the search box for good neighbours. */
const double kNeighbourSearchFactor = 2.5;
|
199
by theraysmith
Changes to textord for 3.00 |
119 |
|
120 |
// Builds an empty grid of the given cell size spanning bleft..tright.
// The per-call working pointers start out NULL, the rerotation starts as
// (1, 0), and no debug window exists yet.
StrokeWidth::StrokeWidth(int gridsize,
                         const ICOORD& bleft, const ICOORD& tright)
    : BlobGrid(gridsize, bleft, tright),
      nontext_map_(NULL),
      projection_(NULL),
      denorm_(NULL),
      grid_box_(bleft, tright),
      rerotation_(1.0f, 0.0f) {
  // Debug windows are only created on demand by the display code.
  smoothed_win_ = NULL;
  textlines_win_ = NULL;
  diacritics_win_ = NULL;
  chains_win_ = NULL;
  initial_widths_win_ = NULL;
  widths_win_ = NULL;
  leaders_win_ = NULL;
}
|
132 |
||
133 |
// Destroys any debug windows that were created. If the stroke-width window
// is open, first waits for it to be closed (when graphics are enabled), and
// exits the process if textord_tabfind_only_strokewidths is set.
StrokeWidth::~StrokeWidth() {
  if (widths_win_ != NULL) {
#ifndef GRAPHICS_DISABLED
    // Block until the window is destroyed, then free the returned event.
    delete widths_win_->AwaitEvent(SVET_DESTROY);
#endif  // GRAPHICS_DISABLED
    if (textord_tabfind_only_strokewidths) {
      exit(0);
    }
    delete widths_win_;
  }
  // Remaining windows, released in the same order as before. Deleting a NULL
  // pointer is a no-op, so windows that were never opened need no check.
  ScrollView* const other_windows[] = {
      leaders_win_,   initial_widths_win_, chains_win_,
      textlines_win_, smoothed_win_,       diacritics_win_};
  const int num_windows =
      static_cast<int>(sizeof(other_windows) / sizeof(other_windows[0]));
  for (int w = 0; w < num_windows; ++w) {
    delete other_windows[w];
  }
}
|
|
149 |
||
150 |
// Sets the neighbours member of the medium-sized blobs in the block.
|
|
151 |
// Searches on 4 sides of each blob for similar-sized, similar-strokewidth
|
|
152 |
// blobs and sets pointers to the good neighbours.
|
|
153 |
void StrokeWidth::SetNeighboursOnMediumBlobs(TO_BLOCK* block) { |
|
154 |
// Run a preliminary strokewidth neighbour detection on the medium blobs.
|
|
155 |
InsertBlobList(&block->blobs); |
|
156 |
BLOBNBOX_IT blob_it(&block->blobs); |
|
157 |
for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { |
|
158 |
SetNeighbours(false, false, blob_it.data()); |
|
159 |
}
|
|
160 |
Clear(); |
|
161 |
}
|
|
162 |
||
163 |
// Sets the neighbour/textline writing direction members of the medium
|
|
164 |
// and large blobs with optional repair of broken CJK characters first.
|
|
165 |
// Repair of broken CJK is needed here because broken CJK characters
|
|
166 |
// can fool the textline direction detection algorithm.
|
|
167 |
void StrokeWidth::FindTextlineDirectionAndFixBrokenCJK(bool cjk_merge, |
|
168 |
TO_BLOCK* input_block) { |
|
169 |
// Setup the grid with the remaining (non-noise) blobs.
|
|
170 |
InsertBlobs(input_block); |
|
171 |
// Repair broken CJK characters if needed.
|
|
172 |
while (cjk_merge && FixBrokenCJK(input_block)); |
|
173 |
// Grade blobs by inspection of neighbours.
|
|
174 |
FindTextlineFlowDirection(false); |
|
175 |
// Clear the grid ready for rotation or leader finding.
|
|
176 |
Clear(); |
|
177 |
}
|
|
178 |
||
179 |
// Helper to collect and count horizontal and vertical blobs from a list.
|
|
180 |
static void CollectHorizVertBlobs(BLOBNBOX_LIST* input_blobs, |
|
181 |
int* num_vertical_blobs, |
|
182 |
int* num_horizontal_blobs, |
|
183 |
BLOBNBOX_CLIST* vertical_blobs, |
|
184 |
BLOBNBOX_CLIST* horizontal_blobs, |
|
185 |
BLOBNBOX_CLIST* nondescript_blobs) { |
|
186 |
BLOBNBOX_C_IT v_it(vertical_blobs); |
|
187 |
BLOBNBOX_C_IT h_it(horizontal_blobs); |
|
188 |
BLOBNBOX_C_IT n_it(nondescript_blobs); |
|
189 |
BLOBNBOX_IT blob_it(input_blobs); |
|
190 |
for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { |
|
191 |
BLOBNBOX* blob = blob_it.data(); |
|
192 |
const TBOX& box = blob->bounding_box(); |
|
193 |
float y_x = static_cast<float>(box.height()) / box.width(); |
|
194 |
float x_y = 1.0f / y_x; |
|
195 |
// Select a >= 1.0 ratio
|
|
196 |
float ratio = x_y > y_x ? x_y : y_x; |
|
197 |
// If the aspect ratio is small and we want them for osd, save the blob.
|
|
198 |
bool ok_blob = ratio <= kSizeRatioToReject; |
|
199 |
if (blob->UniquelyVertical()) { |
|
200 |
++*num_vertical_blobs; |
|
201 |
if (ok_blob) v_it.add_after_then_move(blob); |
|
202 |
} else if (blob->UniquelyHorizontal()) { |
|
203 |
++*num_horizontal_blobs; |
|
204 |
if (ok_blob) h_it.add_after_then_move(blob); |
|
205 |
} else if (ok_blob) { |
|
206 |
n_it.add_after_then_move(blob); |
|
207 |
}
|
|
208 |
}
|
|
209 |
}
|
|
210 |
||
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
211 |
|
212 |
// Types all the blobs as vertical or horizontal text or unknown and
|
|
213 |
// returns true if the majority are vertical.
|
|
214 |
// If the blobs are rotated, it is necessary to call CorrectForRotation
|
|
215 |
// after rotating everything, otherwise the work done here will be enough.
|
|
216 |
// If osd_blobs is not null, a list of blobs from the dominant textline
|
|
217 |
// direction are returned for use in orientation and script detection.
|
|
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
218 |
bool StrokeWidth::TestVerticalTextDirection(TO_BLOCK* block, |
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
219 |
BLOBNBOX_CLIST* osd_blobs) { |
220 |
if (textord_tabfind_force_vertical_text) return true; |
|
221 |
if (!textord_tabfind_vertical_text) return false; |
|
222 |
||
223 |
int vertical_boxes = 0; |
|
224 |
int horizontal_boxes = 0; |
|
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
225 |
// Count vertical normal and large blobs.
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
226 |
BLOBNBOX_CLIST vertical_blobs; |
227 |
BLOBNBOX_CLIST horizontal_blobs; |
|
228 |
BLOBNBOX_CLIST nondescript_blobs; |
|
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
229 |
CollectHorizVertBlobs(&block->blobs, &vertical_boxes, &horizontal_boxes, |
230 |
&vertical_blobs, &horizontal_blobs, &nondescript_blobs); |
|
231 |
CollectHorizVertBlobs(&block->large_blobs, &vertical_boxes, &horizontal_boxes, |
|
232 |
&vertical_blobs, &horizontal_blobs, &nondescript_blobs); |
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
233 |
if (textord_debug_tabfind) |
234 |
tprintf("TextDir hbox=%d vs vbox=%d, %dH, %dV, %dN osd blobs\n", |
|
235 |
horizontal_boxes, vertical_boxes, |
|
236 |
horizontal_blobs.length(), vertical_blobs.length(), |
|
237 |
nondescript_blobs.length()); |
|
238 |
if (osd_blobs != NULL && vertical_boxes == 0 && horizontal_boxes == 0) { |
|
239 |
// Only nondescript blobs available, so return those.
|
|
240 |
BLOBNBOX_C_IT osd_it(osd_blobs); |
|
241 |
osd_it.add_list_after(&nondescript_blobs); |
|
242 |
return false; |
|
243 |
}
|
|
244 |
int min_vert_boxes = static_cast<int>((vertical_boxes + horizontal_boxes) * |
|
245 |
textord_tabfind_vertical_text_ratio); |
|
246 |
if (vertical_boxes >= min_vert_boxes) { |
|
247 |
if (osd_blobs != NULL) { |
|
248 |
BLOBNBOX_C_IT osd_it(osd_blobs); |
|
249 |
osd_it.add_list_after(&vertical_blobs); |
|
250 |
}
|
|
251 |
return true; |
|
252 |
} else { |
|
253 |
if (osd_blobs != NULL) { |
|
254 |
BLOBNBOX_C_IT osd_it(osd_blobs); |
|
255 |
osd_it.add_list_after(&horizontal_blobs); |
|
256 |
}
|
|
257 |
return false; |
|
258 |
}
|
|
259 |
}
|
|
260 |
||
261 |
// Corrects the data structures for the given rotation.
|
|
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
262 |
void StrokeWidth::CorrectForRotation(const FCOORD& rotation, |
263 |
ColPartitionGrid* part_grid) { |
|
264 |
Init(part_grid->gridsize(), part_grid->bleft(), part_grid->tright()); |
|
265 |
grid_box_ = TBOX(bleft(), tright()); |
|
266 |
rerotation_.set_x(rotation.x()); |
|
267 |
rerotation_.set_y(-rotation.y()); |
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
268 |
}
|
269 |
||
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
270 |
// Finds leader partitions and inserts them into the given part_grid.
|
271 |
void StrokeWidth::FindLeaderPartitions(TO_BLOCK* block, |
|
272 |
ColPartitionGrid* part_grid) { |
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
273 |
Clear(); |
274 |
// Find and isolate leaders in the noise list.
|
|
275 |
ColPartition_LIST leader_parts; |
|
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
276 |
FindLeadersAndMarkNoise(block, &leader_parts); |
277 |
// Setup the strokewidth grid with the block's remaining (non-noise) blobs.
|
|
278 |
InsertBlobList(&block->blobs); |
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
279 |
// Mark blobs that have leader neighbours.
|
280 |
for (ColPartition_IT it(&leader_parts); !it.empty(); it.forward()) { |
|
281 |
ColPartition* part = it.extract(); |
|
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
282 |
part->ClaimBoxes(); |
283 |
MarkLeaderNeighbours(part, LR_LEFT); |
|
284 |
MarkLeaderNeighbours(part, LR_RIGHT); |
|
285 |
part_grid->InsertBBox(true, true, part); |
|
286 |
}
|
|
287 |
}
|
|
288 |
||
289 |
// Finds and marks noise those blobs that look like bits of vertical lines
|
|
290 |
// that would otherwise screw up layout analysis.
|
|
291 |
void StrokeWidth::RemoveLineResidue(ColPartition_LIST* big_part_list) { |
|
292 |
BlobGridSearch gsearch(this); |
|
293 |
BLOBNBOX* bbox; |
|
294 |
// For every vertical line-like bbox in the grid, search its neighbours
|
|
295 |
// to find the tallest, and if the original box is taller by sufficient
|
|
296 |
// margin, then call it line residue and delete it.
|
|
297 |
gsearch.StartFullSearch(); |
|
298 |
while ((bbox = gsearch.NextFullSearch()) != NULL) { |
|
299 |
TBOX box = bbox->bounding_box(); |
|
300 |
if (box.height() < box.width() * kLineResidueAspectRatio) |
|
301 |
continue; |
|
302 |
// Set up a rectangle search around the blob to find the size of its
|
|
303 |
// neighbours.
|
|
304 |
int padding = box.height() * kLineResiduePadRatio; |
|
305 |
TBOX search_box = box; |
|
306 |
search_box.pad(padding, padding); |
|
307 |
bool debug = AlignedBlob::WithinTestRegion(2, box.left(), |
|
308 |
box.bottom()); |
|
309 |
// Find the largest object in the search box not equal to bbox.
|
|
310 |
BlobGridSearch rsearch(this); |
|
311 |
int max_size = 0; |
|
312 |
BLOBNBOX* n; |
|
313 |
rsearch.StartRectSearch(search_box); |
|
314 |
while ((n = rsearch.NextRectSearch()) != NULL) { |
|
315 |
if (n == bbox) continue; |
|
316 |
TBOX nbox = n->bounding_box(); |
|
317 |
if (nbox.height() > max_size) { |
|
318 |
max_size = nbox.height(); |
|
319 |
}
|
|
320 |
}
|
|
321 |
if (debug) { |
|
322 |
tprintf("Max neighbour size=%d for candidate line box at:", max_size); |
|
323 |
box.print(); |
|
324 |
}
|
|
325 |
if (max_size * kLineResidueSizeRatio < box.height()) { |
|
519
by zdenop at gmail
fix for GRAPHICS_DISABLED build |
326 |
#ifndef GRAPHICS_DISABLED
|
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
327 |
if (leaders_win_ != NULL) { |
328 |
// We are debugging, so display deleted in pink blobs in the same
|
|
329 |
// window that we use to display leader detection.
|
|
330 |
leaders_win_->Pen(ScrollView::PINK); |
|
331 |
leaders_win_->Rectangle(box.left(), box.bottom(), |
|
332 |
box.right(), box.top()); |
|
333 |
}
|
|
519
by zdenop at gmail
fix for GRAPHICS_DISABLED build |
334 |
#endif // GRAPHICS_DISABLED |
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
335 |
ColPartition::MakeBigPartition(bbox, big_part_list); |
336 |
}
|
|
337 |
}
|
|
338 |
}
|
|
339 |
||
340 |
// Types all the blobs as vertical text or horizontal text or unknown and
|
|
341 |
// puts them into initial ColPartitions in the supplied part_grid.
|
|
342 |
// rerotation determines how to get back to the image coordinates from the
|
|
343 |
// blob coordinates (since they may have been rotated for vertical text).
|
|
344 |
// block is the single block for the whole page or rectangle to be OCRed.
|
|
345 |
// nontext_pix (full-size), is a binary mask used to prevent merges across
|
|
346 |
// photo/text boundaries. It is not kept beyond this function.
|
|
347 |
// denorm provides a mapping back to the image from the current blob
|
|
348 |
// coordinate space.
|
|
349 |
// projection provides a measure of textline density over the image and
|
|
350 |
// provides functions to assist with diacritic detection. It should be a
|
|
351 |
// pointer to a new TextlineProjection, and will be setup here.
|
|
352 |
// part_grid is the output grid of textline partitions.
|
|
353 |
// Large blobs that cause overlap are put in separate partitions and added
|
|
354 |
// to the big_parts list.
|
|
355 |
void StrokeWidth::GradeBlobsIntoPartitions(const FCOORD& rerotation, |
|
356 |
TO_BLOCK* block, |
|
357 |
Pix* nontext_pix, |
|
358 |
const DENORM* denorm, |
|
789
by theraysmith at gmail
Fixed issue 979 |
359 |
bool cjk_script, |
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
360 |
TextlineProjection* projection, |
361 |
ColPartitionGrid* part_grid, |
|
362 |
ColPartition_LIST* big_parts) { |
|
363 |
nontext_map_ = nontext_pix; |
|
364 |
projection_ = projection; |
|
365 |
denorm_ = denorm; |
|
366 |
// Clear and re Insert to take advantage of the tab stops in the blobs.
|
|
367 |
Clear(); |
|
368 |
// Setup the strokewidth grid with the remaining non-noise, non-leader blobs.
|
|
369 |
InsertBlobs(block); |
|
370 |
||
789
by theraysmith at gmail
Fixed issue 979 |
371 |
// Run FixBrokenCJK() again if the page is CJK.
|
372 |
if (cjk_script) { |
|
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
373 |
FixBrokenCJK(block); |
374 |
}
|
|
375 |
FindTextlineFlowDirection(true); |
|
376 |
projection_->ConstructProjection(block, rerotation, nontext_map_); |
|
377 |
if (textord_tabfind_show_strokewidths) { |
|
378 |
ScrollView* line_blobs_win = MakeWindow(0, 0, "Initial textline Blobs"); |
|
379 |
projection_->PlotGradedBlobs(&block->blobs, line_blobs_win); |
|
380 |
projection_->PlotGradedBlobs(&block->small_blobs, line_blobs_win); |
|
381 |
}
|
|
382 |
projection_->MoveNonTextlineBlobs(&block->blobs, &block->noise_blobs); |
|
383 |
projection_->MoveNonTextlineBlobs(&block->small_blobs, &block->noise_blobs); |
|
384 |
// Clear and re Insert to take advantage of the removed diacritics.
|
|
385 |
Clear(); |
|
386 |
InsertBlobs(block); |
|
387 |
FindInitialPartitions(rerotation, block, part_grid, big_parts); |
|
388 |
nontext_map_ = NULL; |
|
389 |
projection_ = NULL; |
|
390 |
denorm_ = NULL; |
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
391 |
}
|
392 |
||
393 |
// Debug helper: prints the bounding box of the given blob together with its
// horizontal and vertical stroke widths and the stroke width estimated from
// the outline as 2 * area / perimeter (p-width).
// neighbour must not be NULL. If the blob has no cblob(), p-width is
// reported as 0.0 instead of dereferencing a NULL pointer.
static void PrintBoxWidths(BLOBNBOX* neighbour) {
  const TBOX& nbox = neighbour->bounding_box();
  // HandleClick only checks cblob() on the clicked blob, not on the
  // neighbours it passes in, so guard against a missing outline here.
  double perimeter_width = 0.0;
  if (neighbour->cblob() != NULL) {
    perimeter_width =
        2.0 * neighbour->cblob()->area() / neighbour->cblob()->perimeter();
  }
  // The last field was "%1.f" (width 1, precision 0), which dropped the
  // fractional digit that the other two widths print.
  tprintf("Box (%d,%d)->(%d,%d): h-width=%.1f, v-width=%.1f p-width=%.1f\n",
          nbox.left(), nbox.bottom(), nbox.right(), nbox.top(),
          neighbour->horz_stroke_width(), neighbour->vert_stroke_width(),
          perimeter_width);
}
|
|
400 |
||
401 |
/** Handles a click event in a display window. */
|
|
402 |
void StrokeWidth::HandleClick(int x, int y) { |
|
403 |
BBGrid<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT>::HandleClick(x, y); |
|
404 |
// Run a radial search for blobs that overlap.
|
|
405 |
BlobGridSearch radsearch(this); |
|
406 |
radsearch.StartRadSearch(x, y, 1); |
|
407 |
BLOBNBOX* neighbour; |
|
408 |
FCOORD click(static_cast<float>(x), static_cast<float>(y)); |
|
409 |
while ((neighbour = radsearch.NextRadSearch()) != NULL) { |
|
410 |
TBOX nbox = neighbour->bounding_box(); |
|
411 |
if (nbox.contains(click) && neighbour->cblob() != NULL) { |
|
412 |
PrintBoxWidths(neighbour); |
|
413 |
if (neighbour->neighbour(BND_LEFT) != NULL) |
|
414 |
PrintBoxWidths(neighbour->neighbour(BND_LEFT)); |
|
415 |
if (neighbour->neighbour(BND_RIGHT) != NULL) |
|
416 |
PrintBoxWidths(neighbour->neighbour(BND_RIGHT)); |
|
417 |
if (neighbour->neighbour(BND_ABOVE) != NULL) |
|
418 |
PrintBoxWidths(neighbour->neighbour(BND_ABOVE)); |
|
419 |
if (neighbour->neighbour(BND_BELOW) != NULL) |
|
420 |
PrintBoxWidths(neighbour->neighbour(BND_BELOW)); |
|
421 |
int gaps[BND_COUNT]; |
|
422 |
neighbour->NeighbourGaps(gaps); |
|
423 |
tprintf("Left gap=%d, right=%d, above=%d, below=%d, horz=%d, vert=%d\n" |
|
424 |
"Good= %d %d %d %d\n", |
|
425 |
gaps[BND_LEFT], gaps[BND_RIGHT], |
|
426 |
gaps[BND_ABOVE], gaps[BND_BELOW], |
|
427 |
neighbour->horz_possible(), |
|
428 |
neighbour->vert_possible(), |
|
429 |
neighbour->good_stroke_neighbour(BND_LEFT), |
|
430 |
neighbour->good_stroke_neighbour(BND_RIGHT), |
|
431 |
neighbour->good_stroke_neighbour(BND_ABOVE), |
|
432 |
neighbour->good_stroke_neighbour(BND_BELOW)); |
|
433 |
break; |
|
434 |
}
|
|
435 |
}
|
|
436 |
}
|
|
437 |
||
438 |
// Detects and marks leader dots/dashes.
|
|
439 |
// Leaders are horizontal chains of small or noise blobs that look
|
|
440 |
// monospace according to ColPartition::MarkAsLeaderIfMonospaced().
|
|
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
441 |
// Detected leaders become the only occupants of the block->small_blobs list.
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
442 |
// Non-leader small blobs get moved to the blobs list.
|
443 |
// Non-leader noise blobs remain singletons in the noise list.
|
|
444 |
// All small and noise blobs in high density regions are marked BTFT_NONTEXT.
|
|
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
445 |
// block is the single block for the whole page or rectangle to be OCRed.
|
446 |
// leader_parts is the output.
|
|
447 |
void StrokeWidth::FindLeadersAndMarkNoise(TO_BLOCK* block, |
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
448 |
ColPartition_LIST* leader_parts) { |
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
449 |
InsertBlobList(&block->small_blobs); |
450 |
InsertBlobList(&block->noise_blobs); |
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
451 |
BlobGridSearch gsearch(this); |
452 |
BLOBNBOX* bbox; |
|
453 |
// For every bbox in the grid, set its neighbours.
|
|
454 |
gsearch.StartFullSearch(); |
|
455 |
while ((bbox = gsearch.NextFullSearch()) != NULL) { |
|
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
456 |
SetNeighbours(true, false, bbox); |
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
457 |
}
|
458 |
ColPartition_IT part_it(leader_parts); |
|
459 |
gsearch.StartFullSearch(); |
|
460 |
while ((bbox = gsearch.NextFullSearch()) != NULL) { |
|
461 |
if (bbox->flow() == BTFT_NONE) { |
|
462 |
if (bbox->neighbour(BND_RIGHT) == NULL && |
|
463 |
bbox->neighbour(BND_LEFT) == NULL) |
|
464 |
continue; |
|
465 |
// Put all the linked blobs into a ColPartition.
|
|
466 |
ColPartition* part = new ColPartition(BRT_UNKNOWN, ICOORD(0, 1)); |
|
467 |
BLOBNBOX* blob; |
|
468 |
for (blob = bbox; blob != NULL && blob->flow() == BTFT_NONE; |
|
469 |
blob = blob->neighbour(BND_RIGHT)) |
|
470 |
part->AddBox(blob); |
|
471 |
for (blob = bbox->neighbour(BND_LEFT); blob != NULL && |
|
472 |
blob->flow() == BTFT_NONE; |
|
473 |
blob = blob->neighbour(BND_LEFT)) |
|
474 |
part->AddBox(blob); |
|
475 |
if (part->MarkAsLeaderIfMonospaced()) |
|
476 |
part_it.add_after_then_move(part); |
|
477 |
else
|
|
478 |
delete part; |
|
479 |
}
|
|
480 |
}
|
|
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
481 |
if (textord_tabfind_show_strokewidths) { |
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
482 |
leaders_win_ = DisplayGoodBlobs("LeaderNeighbours", 0, 0); |
483 |
}
|
|
484 |
// Move any non-leaders from the small to the blobs list, as they are
|
|
485 |
// most likely dashes or broken characters.
|
|
486 |
BLOBNBOX_IT blob_it(&block->blobs); |
|
487 |
BLOBNBOX_IT small_it(&block->small_blobs); |
|
488 |
for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) { |
|
489 |
BLOBNBOX* blob = small_it.data(); |
|
490 |
if (blob->flow() != BTFT_LEADER) { |
|
491 |
if (blob->flow() == BTFT_NEIGHBOURS) |
|
492 |
blob->set_flow(BTFT_NONE); |
|
493 |
blob->ClearNeighbours(); |
|
494 |
blob_it.add_to_end(small_it.extract()); |
|
495 |
}
|
|
496 |
}
|
|
497 |
// Move leaders from the noise list to the small list, leaving the small
|
|
498 |
// list exclusively leaders, so they don't get processed further,
|
|
499 |
// and the remaining small blobs all in the noise list.
|
|
500 |
BLOBNBOX_IT noise_it(&block->noise_blobs); |
|
501 |
for (noise_it.mark_cycle_pt(); !noise_it.cycled_list(); noise_it.forward()) { |
|
502 |
BLOBNBOX* blob = noise_it.data(); |
|
503 |
if (blob->flow() == BTFT_LEADER || blob->joined_to_prev()) { |
|
504 |
small_it.add_to_end(noise_it.extract()); |
|
505 |
} else if (blob->flow() == BTFT_NEIGHBOURS) { |
|
506 |
blob->set_flow(BTFT_NONE); |
|
507 |
blob->ClearNeighbours(); |
|
508 |
}
|
|
509 |
}
|
|
510 |
// Clear the grid as we don't want the small stuff hanging around in it.
|
|
511 |
Clear(); |
|
199
by theraysmith
Changes to textord for 3.00 |
512 |
}
|
513 |
||
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
514 |
/** Inserts the block blobs (normal and large) into this grid.
|
515 |
* Blobs remain owned by the block. */
|
|
516 |
void StrokeWidth::InsertBlobs(TO_BLOCK* block) { |
|
517 |
InsertBlobList(&block->blobs); |
|
518 |
InsertBlobList(&block->large_blobs); |
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
519 |
}
|
520 |
||
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
521 |
// Checks the left or right side of the given leader partition and sets the
|
522 |
// (opposite) leader_on_right or leader_on_left flags for blobs
|
|
523 |
// that are next to the given side of the given leader partition.
|
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
524 |
void StrokeWidth::MarkLeaderNeighbours(const ColPartition* part, |
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
525 |
LeftOrRight side) { |
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
526 |
const TBOX& part_box = part->bounding_box(); |
527 |
BlobGridSearch blobsearch(this); |
|
528 |
// Search to the side of the leader for the nearest neighbour.
|
|
529 |
BLOBNBOX* best_blob = NULL; |
|
530 |
int best_gap = 0; |
|
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
531 |
blobsearch.StartSideSearch(side == LR_LEFT ? part_box.left() |
532 |
: part_box.right(), |
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
533 |
part_box.bottom(), part_box.top()); |
534 |
BLOBNBOX* blob; |
|
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
535 |
while ((blob = blobsearch.NextSideSearch(side == LR_LEFT)) != NULL) { |
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
536 |
const TBOX& blob_box = blob->bounding_box(); |
537 |
if (!blob_box.y_overlap(part_box)) |
|
538 |
continue; |
|
539 |
int x_gap = blob_box.x_gap(part_box); |
|
540 |
if (x_gap > 2 * gridsize()) { |
|
541 |
break; |
|
542 |
} else if (best_blob == NULL || x_gap < best_gap) { |
|
543 |
best_blob = blob; |
|
544 |
best_gap = x_gap; |
|
545 |
}
|
|
546 |
}
|
|
547 |
if (best_blob != NULL) { |
|
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
548 |
if (side == LR_LEFT) |
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
549 |
best_blob->set_leader_on_right(true); |
550 |
else
|
|
551 |
best_blob->set_leader_on_left(true); |
|
519
by zdenop at gmail
fix for GRAPHICS_DISABLED build |
552 |
#ifndef GRAPHICS_DISABLED
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
553 |
if (leaders_win_ != NULL) { |
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
554 |
leaders_win_->Pen(side == LR_LEFT ? ScrollView::RED : ScrollView::GREEN); |
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
555 |
const TBOX& blob_box = best_blob->bounding_box(); |
556 |
leaders_win_->Rectangle(blob_box.left(), blob_box.bottom(), |
|
557 |
blob_box.right(), blob_box.top()); |
|
558 |
}
|
|
519
by zdenop at gmail
fix for GRAPHICS_DISABLED build |
559 |
#endif // GRAPHICS_DISABLED |
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
560 |
}
|
561 |
}
|
|
562 |
||
563 |
// Helper to compute the UQ of the square-ish CJK charcters.
|
|
564 |
static int UpperQuartileCJKSize(int gridsize, BLOBNBOX_LIST* blobs) { |
|
565 |
STATS sizes(0, gridsize * kMaxCJKSizeRatio); |
|
566 |
BLOBNBOX_IT it(blobs); |
|
567 |
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { |
|
568 |
BLOBNBOX* blob = it.data(); |
|
569 |
int width = blob->bounding_box().width(); |
|
570 |
int height = blob->bounding_box().height(); |
|
571 |
if (width <= height * kCJKAspectRatio && height < width * kCJKAspectRatio) |
|
572 |
sizes.add(height, 1); |
|
573 |
}
|
|
574 |
return static_cast<int>(sizes.ile(0.75f) + 0.5); |
|
575 |
}
|
|
576 |
||
577 |
// Fix broken CJK characters, using the fake joined blobs mechanism.
|
|
578 |
// Blobs are really merged, ie the master takes all the outlines and the
|
|
579 |
// others are deleted.
|
|
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
580 |
// Returns true if sufficient blobs are merged that it may be worth running
|
581 |
// again, due to a better estimate of character size.
|
|
582 |
bool StrokeWidth::FixBrokenCJK(TO_BLOCK* block) { |
|
583 |
BLOBNBOX_LIST* blobs = &block->blobs; |
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
584 |
int median_height = UpperQuartileCJKSize(gridsize(), blobs); |
585 |
int max_dist = static_cast<int>(median_height * kCJKBrokenDistanceFraction); |
|
586 |
int max_size = static_cast<int>(median_height * kCJKAspectRatio); |
|
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
587 |
int num_fixed = 0; |
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
588 |
BLOBNBOX_IT blob_it(blobs); |
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
589 |
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
590 |
for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { |
591 |
BLOBNBOX* blob = blob_it.data(); |
|
592 |
if (blob->cblob() == NULL || blob->cblob()->out_list()->empty()) |
|
593 |
continue; |
|
594 |
TBOX bbox = blob->bounding_box(); |
|
595 |
bool debug = AlignedBlob::WithinTestRegion(3, bbox.left(), |
|
596 |
bbox.bottom()); |
|
597 |
if (debug) { |
|
598 |
tprintf("Checking for Broken CJK (max size=%d):", max_size); |
|
599 |
bbox.print(); |
|
600 |
}
|
|
601 |
// Generate a list of blobs that overlap or are near enough to merge.
|
|
602 |
BLOBNBOX_CLIST overlapped_blobs; |
|
603 |
AccumulateOverlaps(blob, debug, max_size, max_dist, |
|
604 |
&bbox, &overlapped_blobs); |
|
605 |
if (!overlapped_blobs.empty()) { |
|
606 |
// There are overlapping blobs, so qualify them as being satisfactory
|
|
607 |
// before removing them from the grid and replacing them with the union.
|
|
608 |
// The final box must be roughly square.
|
|
609 |
if (bbox.width() > bbox.height() * kCJKAspectRatio || |
|
610 |
bbox.height() > bbox.width() * kCJKAspectRatio) { |
|
611 |
if (debug) { |
|
612 |
tprintf("Bad final aspectratio:"); |
|
613 |
bbox.print(); |
|
614 |
}
|
|
615 |
continue; |
|
616 |
}
|
|
617 |
// There can't be too many blobs to merge.
|
|
618 |
if (overlapped_blobs.length() >= kCJKMaxComponents) { |
|
619 |
if (debug) |
|
620 |
tprintf("Too many neighbours: %d\n", overlapped_blobs.length()); |
|
621 |
continue; |
|
622 |
}
|
|
623 |
// The strokewidths must match amongst the join candidates.
|
|
624 |
BLOBNBOX_C_IT n_it(&overlapped_blobs); |
|
625 |
for (n_it.mark_cycle_pt(); !n_it.cycled_list(); n_it.forward()) { |
|
626 |
BLOBNBOX* neighbour = NULL; |
|
627 |
neighbour = n_it.data(); |
|
628 |
if (!blob->MatchingStrokeWidth(*neighbour, kStrokeWidthFractionCJK, |
|
629 |
kStrokeWidthCJK)) |
|
630 |
break; |
|
631 |
}
|
|
632 |
if (!n_it.cycled_list()) { |
|
633 |
if (debug) { |
|
634 |
tprintf("Bad stroke widths:"); |
|
635 |
PrintBoxWidths(blob); |
|
636 |
}
|
|
637 |
continue; // Not good enough. |
|
638 |
}
|
|
639 |
||
640 |
// Merge all the candidates into blob.
|
|
641 |
// We must remove blob from the grid and reinsert it after merging
|
|
642 |
// to maintain the integrity of the grid.
|
|
643 |
RemoveBBox(blob); |
|
644 |
// Everything else will be calculated later.
|
|
645 |
for (n_it.mark_cycle_pt(); !n_it.cycled_list(); n_it.forward()) { |
|
646 |
BLOBNBOX* neighbour = n_it.data(); |
|
647 |
RemoveBBox(neighbour); |
|
623
by theraysmith at gmail
Added sparse text mode, also fixed issue 653. |
648 |
// Mark empty blob for deletion.
|
649 |
neighbour->set_region_type(BRT_NOISE); |
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
650 |
blob->really_merge(neighbour); |
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
651 |
if (rerotation_.x() != 1.0f || rerotation_.y() != 0.0f) { |
652 |
blob->rotate_box(rerotation_); |
|
653 |
}
|
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
654 |
}
|
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
655 |
InsertBBox(true, true, blob); |
656 |
++num_fixed; |
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
657 |
if (debug) { |
658 |
tprintf("Done! Final box:"); |
|
659 |
bbox.print(); |
|
660 |
}
|
|
661 |
}
|
|
662 |
}
|
|
623
by theraysmith at gmail
Added sparse text mode, also fixed issue 653. |
663 |
// Count remaining blobs.
|
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
664 |
int num_remaining = 0; |
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
665 |
for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { |
666 |
BLOBNBOX* blob = blob_it.data(); |
|
623
by theraysmith at gmail
Added sparse text mode, also fixed issue 653. |
667 |
if (blob->cblob() != NULL && !blob->cblob()->out_list()->empty()) { |
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
668 |
++num_remaining; |
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
669 |
}
|
670 |
}
|
|
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
671 |
// Permanently delete all the marked blobs after first removing all
|
672 |
// references in the neighbour members.
|
|
673 |
block->DeleteUnownedNoise(); |
|
674 |
return num_fixed > num_remaining * kBrokenCJKIterationFraction; |
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
675 |
}
|
676 |
||
677 |
// Helper function to determine whether it is reasonable to merge the
|
|
678 |
// bbox and the nbox for repairing broken CJK.
|
|
679 |
// The distance apart must not exceed max_dist, the combined size must
|
|
680 |
// not exceed max_size, and the aspect ratio must either improve or at
|
|
681 |
// least not get worse by much.
|
|
682 |
static bool AcceptableCJKMerge(const TBOX& bbox, const TBOX& nbox, |
|
683 |
bool debug, int max_size, int max_dist, |
|
684 |
int* x_gap, int* y_gap) { |
|
685 |
*x_gap = bbox.x_gap(nbox); |
|
686 |
*y_gap = bbox.y_gap(nbox); |
|
687 |
TBOX merged(nbox); |
|
688 |
merged += bbox; |
|
689 |
if (debug) { |
|
690 |
tprintf("gaps = %d, %d, merged_box:", *x_gap, *y_gap); |
|
691 |
merged.print(); |
|
692 |
}
|
|
693 |
if (*x_gap <= max_dist && *y_gap <= max_dist && |
|
694 |
merged.width() <= max_size && merged.height() <= max_size) { |
|
695 |
// Close enough to call overlapping. Check aspect ratios.
|
|
696 |
double old_ratio = static_cast<double>(bbox.width()) / bbox.height(); |
|
697 |
if (old_ratio < 1.0) old_ratio = 1.0 / old_ratio; |
|
698 |
double new_ratio = static_cast<double>(merged.width()) / merged.height(); |
|
699 |
if (new_ratio < 1.0) new_ratio = 1.0 / new_ratio; |
|
700 |
if (new_ratio <= old_ratio * kCJKAspectRatioIncrease) |
|
701 |
return true; |
|
702 |
}
|
|
703 |
return false; |
|
704 |
}
|
|
705 |
||
706 |
// Collect blobs that overlap or are within max_dist of the input bbox.
|
|
707 |
// Return them in the list of blobs and expand the bbox to be the union
|
|
708 |
// of all the boxes. not_this is excluded from the search, as are blobs
|
|
709 |
// that cause the merged box to exceed max_size in either dimension.
|
|
710 |
void StrokeWidth::AccumulateOverlaps(const BLOBNBOX* not_this, bool debug, |
|
711 |
int max_size, int max_dist, |
|
712 |
TBOX* bbox, BLOBNBOX_CLIST* blobs) { |
|
713 |
// While searching, nearests holds the nearest failed blob in each
|
|
714 |
// direction. When we have a nearest in each of the 4 directions, then
|
|
715 |
// the search is over, and at this point the final bbox must not overlap
|
|
716 |
// any of the nearests.
|
|
717 |
BLOBNBOX* nearests[BND_COUNT]; |
|
718 |
for (int i = 0; i < BND_COUNT; ++i) { |
|
719 |
nearests[i] = NULL; |
|
720 |
}
|
|
721 |
int x = (bbox->left() + bbox->right()) / 2; |
|
722 |
int y = (bbox->bottom() + bbox->top()) / 2; |
|
723 |
// Run a radial search for blobs that overlap or are sufficiently close.
|
|
724 |
BlobGridSearch radsearch(this); |
|
725 |
radsearch.StartRadSearch(x, y, kCJKRadius); |
|
726 |
BLOBNBOX* neighbour; |
|
727 |
while ((neighbour = radsearch.NextRadSearch()) != NULL) { |
|
728 |
if (neighbour == not_this) continue; |
|
729 |
TBOX nbox = neighbour->bounding_box(); |
|
730 |
int x_gap, y_gap; |
|
731 |
if (AcceptableCJKMerge(*bbox, nbox, debug, max_size, max_dist, |
|
732 |
&x_gap, &y_gap)) { |
|
733 |
// Close enough to call overlapping. Merge boxes.
|
|
734 |
*bbox += nbox; |
|
735 |
blobs->add_sorted(SortByBoxLeft<BLOBNBOX>, true, neighbour); |
|
736 |
if (debug) { |
|
737 |
tprintf("Added:"); |
|
738 |
nbox.print(); |
|
739 |
}
|
|
740 |
// Since we merged, search the nearests, as some might now me mergeable.
|
|
741 |
for (int dir = 0; dir < BND_COUNT; ++dir) { |
|
742 |
if (nearests[dir] == NULL) continue; |
|
743 |
nbox = nearests[dir]->bounding_box(); |
|
744 |
if (AcceptableCJKMerge(*bbox, nbox, debug, max_size, |
|
745 |
max_dist, &x_gap, &y_gap)) { |
|
746 |
// Close enough to call overlapping. Merge boxes.
|
|
747 |
*bbox += nbox; |
|
748 |
blobs->add_sorted(SortByBoxLeft<BLOBNBOX>, true, nearests[dir]); |
|
749 |
if (debug) { |
|
750 |
tprintf("Added:"); |
|
751 |
nbox.print(); |
|
752 |
}
|
|
753 |
nearests[dir] = NULL; |
|
754 |
dir = -1; // Restart the search. |
|
755 |
}
|
|
756 |
}
|
|
757 |
} else if (x_gap < 0 && x_gap <= y_gap) { |
|
758 |
// A vertical neighbour. Record the nearest.
|
|
759 |
BlobNeighbourDir dir = nbox.top() > bbox->top() ? BND_ABOVE : BND_BELOW; |
|
760 |
if (nearests[dir] == NULL || |
|
761 |
y_gap < bbox->y_gap(nearests[dir]->bounding_box())) { |
|
762 |
nearests[dir] = neighbour; |
|
763 |
}
|
|
764 |
} else if (y_gap < 0 && y_gap <= x_gap) { |
|
765 |
// A horizontal neighbour. Record the nearest.
|
|
766 |
BlobNeighbourDir dir = nbox.left() > bbox->left() ? BND_RIGHT : BND_LEFT; |
|
767 |
if (nearests[dir] == NULL || |
|
768 |
x_gap < bbox->x_gap(nearests[dir]->bounding_box())) { |
|
769 |
nearests[dir] = neighbour; |
|
770 |
}
|
|
771 |
}
|
|
772 |
// If all nearests are non-null, then we have finished.
|
|
773 |
if (nearests[BND_LEFT] && nearests[BND_RIGHT] && |
|
774 |
nearests[BND_ABOVE] && nearests[BND_BELOW]) |
|
775 |
break; |
|
776 |
}
|
|
777 |
// Final overlap with a nearest is not allowed.
|
|
778 |
for (int dir = 0; dir < BND_COUNT; ++dir) { |
|
779 |
if (nearests[dir] == NULL) continue; |
|
780 |
const TBOX& nbox = nearests[dir]->bounding_box(); |
|
781 |
if (debug) { |
|
782 |
tprintf("Testing for overlap with:"); |
|
783 |
nbox.print(); |
|
784 |
}
|
|
785 |
if (bbox->overlap(nbox)) { |
|
786 |
blobs->shallow_clear(); |
|
787 |
if (debug) |
|
788 |
tprintf("Final box overlaps nearest\n"); |
|
789 |
return; |
|
790 |
}
|
|
791 |
}
|
|
792 |
}
|
|
793 |
||
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
794 |
// For each blob in this grid, Finds the textline direction to be horizontal
|
795 |
// or vertical according to distance to neighbours and 1st and 2nd order
|
|
796 |
// neighbours. Non-text tends to end up without a definite direction.
|
|
797 |
// Result is setting of the neighbours and vert_possible/horz_possible
|
|
798 |
// flags in the BLOBNBOXes currently in this grid.
|
|
799 |
// This function is called more than once if page orientation is uncertain,
|
|
800 |
// so display_if_debugging is true on the final call to display the results.
|
|
801 |
void StrokeWidth::FindTextlineFlowDirection(bool display_if_debugging) { |
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
802 |
BlobGridSearch gsearch(this); |
803 |
BLOBNBOX* bbox; |
|
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
804 |
// For every bbox in the grid, set its neighbours.
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
805 |
gsearch.StartFullSearch(); |
806 |
while ((bbox = gsearch.NextFullSearch()) != NULL) { |
|
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
807 |
SetNeighbours(false, display_if_debugging, bbox); |
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
808 |
}
|
809 |
// Where vertical or horizontal wins by a big margin, clarify it.
|
|
810 |
gsearch.StartFullSearch(); |
|
811 |
while ((bbox = gsearch.NextFullSearch()) != NULL) { |
|
812 |
SimplifyObviousNeighbours(bbox); |
|
813 |
}
|
|
814 |
// Now try to make the blobs only vertical or horizontal using neighbours.
|
|
815 |
gsearch.StartFullSearch(); |
|
816 |
while ((bbox = gsearch.NextFullSearch()) != NULL) { |
|
817 |
SetNeighbourFlows(bbox); |
|
818 |
}
|
|
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
819 |
if ((textord_tabfind_show_strokewidths && display_if_debugging) || |
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
820 |
textord_tabfind_show_strokewidths > 1) { |
821 |
initial_widths_win_ = DisplayGoodBlobs("InitialStrokewidths", 400, 0); |
|
822 |
}
|
|
823 |
// Improve flow direction with neighbours.
|
|
824 |
gsearch.StartFullSearch(); |
|
825 |
while ((bbox = gsearch.NextFullSearch()) != NULL) { |
|
826 |
SmoothNeighbourTypes(bbox, false); |
|
827 |
}
|
|
828 |
// Now allow reset of firm values to fix renegades.
|
|
829 |
gsearch.StartFullSearch(); |
|
830 |
while ((bbox = gsearch.NextFullSearch()) != NULL) { |
|
831 |
SmoothNeighbourTypes(bbox, true); |
|
832 |
}
|
|
833 |
// Repeat.
|
|
834 |
gsearch.StartFullSearch(); |
|
835 |
while ((bbox = gsearch.NextFullSearch()) != NULL) { |
|
836 |
SmoothNeighbourTypes(bbox, true); |
|
837 |
}
|
|
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
838 |
if ((textord_tabfind_show_strokewidths && display_if_debugging) || |
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
839 |
textord_tabfind_show_strokewidths > 1) { |
840 |
widths_win_ = DisplayGoodBlobs("ImprovedStrokewidths", 800, 0); |
|
841 |
}
|
|
842 |
}
|
|
843 |
||
844 |
// Sets the neighbours and good_stroke_neighbours members of the blob by
|
|
845 |
// searching close on all 4 sides.
|
|
846 |
// When finding leader dots/dashes, there is a slightly different rule for
|
|
847 |
// what makes a good neighbour.
|
|
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
848 |
void StrokeWidth::SetNeighbours(bool leaders, bool activate_line_trap, |
849 |
BLOBNBOX* blob) { |
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
850 |
int line_trap_count = 0; |
851 |
for (int dir = 0; dir < BND_COUNT; ++dir) { |
|
852 |
BlobNeighbourDir bnd = static_cast<BlobNeighbourDir>(dir); |
|
853 |
line_trap_count += FindGoodNeighbour(bnd, leaders, blob); |
|
854 |
}
|
|
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
855 |
if (line_trap_count > 0 && activate_line_trap) { |
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
856 |
// It looks like a line so isolate it by clearing its neighbours.
|
857 |
blob->ClearNeighbours(); |
|
858 |
const TBOX& box = blob->bounding_box(); |
|
859 |
blob->set_region_type(box.width() > box.height() ? BRT_HLINE : BRT_VLINE); |
|
860 |
}
|
|
861 |
}
|
|
862 |
||
863 |
||
864 |
// Sets the good_stroke_neighbours member of the blob if it has a
|
|
865 |
// GoodNeighbour on the given side.
|
|
866 |
// Also sets the neighbour in the blob, whether or not a good one is found.
|
|
867 |
// Returns the number of blobs in the nearby search area that would lead us to
|
|
868 |
// believe that this blob is a line separator.
|
|
869 |
// Leaders get extra special lenient treatment.
|
|
870 |
int StrokeWidth::FindGoodNeighbour(BlobNeighbourDir dir, bool leaders, |
|
871 |
BLOBNBOX* blob) { |
|
872 |
// Search for neighbours that overlap vertically.
|
|
873 |
TBOX blob_box = blob->bounding_box(); |
|
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
874 |
bool debug = AlignedBlob::WithinTestRegion(2, blob_box.left(), |
875 |
blob_box.bottom()); |
|
876 |
if (debug) { |
|
877 |
tprintf("FGN in dir %d for blob:", dir); |
|
878 |
blob_box.print(); |
|
879 |
}
|
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
880 |
int top = blob_box.top(); |
881 |
int bottom = blob_box.bottom(); |
|
882 |
int left = blob_box.left(); |
|
883 |
int right = blob_box.right(); |
|
884 |
int width = right - left; |
|
885 |
int height = top - bottom; |
|
886 |
||
887 |
// A trap to detect lines tests for the min dimension of neighbours
|
|
888 |
// being larger than a multiple of the min dimension of the line
|
|
889 |
// and the larger dimension being smaller than a fraction of the max
|
|
890 |
// dimension of the line.
|
|
891 |
int line_trap_max = MAX(width, height) / kLineTrapLongest; |
|
892 |
int line_trap_min = MIN(width, height) * kLineTrapShortest; |
|
893 |
int line_trap_count = 0; |
|
894 |
||
895 |
int min_good_overlap = (dir == BND_LEFT || dir == BND_RIGHT) |
|
896 |
? height / 2 : width / 2; |
|
897 |
int min_decent_overlap = (dir == BND_LEFT || dir == BND_RIGHT) |
|
898 |
? height / 3 : width / 3; |
|
899 |
if (leaders) |
|
900 |
min_good_overlap = min_decent_overlap = 1; |
|
901 |
||
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
902 |
int search_pad = static_cast<int>( |
903 |
sqrt(static_cast<double>(width * height)) * kNeighbourSearchFactor); |
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
904 |
if (gridsize() > search_pad) |
905 |
search_pad = gridsize(); |
|
906 |
TBOX search_box = blob_box; |
|
907 |
// Pad the search in the appropriate direction.
|
|
908 |
switch (dir) { |
|
909 |
case BND_LEFT: |
|
910 |
search_box.set_left(search_box.left() - search_pad); |
|
911 |
break; |
|
912 |
case BND_RIGHT: |
|
913 |
search_box.set_right(search_box.right() + search_pad); |
|
914 |
break; |
|
915 |
case BND_BELOW: |
|
916 |
search_box.set_bottom(search_box.bottom() - search_pad); |
|
917 |
break; |
|
918 |
case BND_ABOVE: |
|
919 |
search_box.set_top(search_box.top() + search_pad); |
|
920 |
break; |
|
921 |
case BND_COUNT: |
|
922 |
return 0; |
|
923 |
}
|
|
924 |
||
925 |
BlobGridSearch rectsearch(this); |
|
926 |
rectsearch.StartRectSearch(search_box); |
|
927 |
BLOBNBOX* best_neighbour = NULL; |
|
928 |
double best_goodness = 0.0; |
|
929 |
bool best_is_good = false; |
|
930 |
BLOBNBOX* neighbour; |
|
931 |
while ((neighbour = rectsearch.NextRectSearch()) != NULL) { |
|
932 |
TBOX nbox = neighbour->bounding_box(); |
|
933 |
if (neighbour == blob) |
|
934 |
continue; |
|
935 |
int mid_x = (nbox.left() + nbox.right()) / 2; |
|
936 |
if (mid_x < blob->left_rule() || mid_x > blob->right_rule()) |
|
937 |
continue; // In a different column. |
|
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
938 |
if (debug) { |
939 |
tprintf("Neighbour at:"); |
|
940 |
nbox.print(); |
|
941 |
}
|
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
942 |
|
943 |
// Last-minute line detector. There is a small upper limit to the line
|
|
944 |
// width accepted by the morphological line detector.
|
|
945 |
int n_width = nbox.width(); |
|
946 |
int n_height = nbox.height(); |
|
947 |
if (MIN(n_width, n_height) > line_trap_min && |
|
948 |
MAX(n_width, n_height) < line_trap_max) |
|
949 |
++line_trap_count; |
|
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
950 |
// Heavily joined text, such as Arabic may have very different sizes when
|
951 |
// looking at the maxes, but the heights may be almost identical, so check
|
|
952 |
// for a difference in height if looking sideways or width vertically.
|
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
953 |
if (TabFind::VeryDifferentSizes(MAX(n_width, n_height), |
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
954 |
MAX(width, height)) && |
955 |
(((dir == BND_LEFT || dir ==BND_RIGHT) && |
|
956 |
TabFind::DifferentSizes(n_height, height)) || |
|
957 |
((dir == BND_BELOW || dir ==BND_ABOVE) && |
|
958 |
TabFind::DifferentSizes(n_width, width)))) { |
|
959 |
if (debug) tprintf("Bad size\n"); |
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
960 |
continue; // Could be a different font size or non-text. |
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
961 |
}
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
962 |
// Amount of vertical overlap between the blobs.
|
963 |
int overlap; |
|
964 |
// If the overlap is along the short side of the neighbour, and it
|
|
965 |
// is fully overlapped, then perp_overlap holds the length of the long
|
|
966 |
// side of the neighbour. A measure to include hyphens and dashes as
|
|
967 |
// legitimate neighbours.
|
|
968 |
int perp_overlap; |
|
969 |
int gap; |
|
970 |
if (dir == BND_LEFT || dir == BND_RIGHT) { |
|
971 |
overlap = MIN(nbox.top(), top) - MAX(nbox.bottom(), bottom); |
|
972 |
if (overlap == nbox.height() && nbox.width() > nbox.height()) |
|
973 |
perp_overlap = nbox.width(); |
|
974 |
else
|
|
975 |
perp_overlap = overlap; |
|
976 |
gap = dir == BND_LEFT ? left - nbox.left() : nbox.right() - right; |
|
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
977 |
if (gap <= 0) { |
978 |
if (debug) tprintf("On wrong side\n"); |
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
979 |
continue; // On the wrong side. |
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
980 |
}
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
981 |
gap -= n_width; |
982 |
} else { |
|
983 |
overlap = MIN(nbox.right(), right) - MAX(nbox.left(), left); |
|
984 |
if (overlap == nbox.width() && nbox.height() > nbox.width()) |
|
985 |
perp_overlap = nbox.height(); |
|
986 |
else
|
|
987 |
perp_overlap = overlap; |
|
988 |
gap = dir == BND_BELOW ? bottom - nbox.bottom() : nbox.top() - top; |
|
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
989 |
if (gap <= 0) { |
990 |
if (debug) tprintf("On wrong side\n"); |
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
991 |
continue; // On the wrong side. |
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
992 |
}
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
993 |
gap -= n_height; |
994 |
}
|
|
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
995 |
if (-gap > overlap) { |
996 |
if (debug) tprintf("Overlaps wrong way\n"); |
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
997 |
continue; // Overlaps the wrong way. |
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
998 |
}
|
999 |
if (perp_overlap < min_decent_overlap) { |
|
1000 |
if (debug) tprintf("Doesn't overlap enough\n"); |
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
1001 |
continue; // Doesn't overlap enough. |
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
1002 |
}
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
1003 |
bool bad_sizes = TabFind::DifferentSizes(height, n_height) && |
1004 |
TabFind::DifferentSizes(width, n_width); |
|
1005 |
bool is_good = overlap >= min_good_overlap && !bad_sizes && |
|
1006 |
blob->MatchingStrokeWidth(*neighbour, |
|
1007 |
kStrokeWidthFractionTolerance, |
|
1008 |
kStrokeWidthTolerance); |
|
1009 |
// Best is a fuzzy combination of gap, overlap and is good.
|
|
1010 |
// Basically if you make one thing twice as good without making
|
|
1011 |
// anything else twice as bad, then it is better.
|
|
1012 |
if (gap < 1) gap = 1; |
|
1013 |
double goodness = (1.0 + is_good) * overlap / gap; |
|
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
1014 |
if (debug) { |
1015 |
tprintf("goodness = %g vs best of %g, good=%d, overlap=%d, gap=%d\n", |
|
1016 |
goodness, best_goodness, is_good, overlap, gap); |
|
1017 |
}
|
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
1018 |
if (goodness > best_goodness) { |
1019 |
best_neighbour = neighbour; |
|
1020 |
best_goodness = goodness; |
|
1021 |
best_is_good = is_good; |
|
1022 |
}
|
|
1023 |
}
|
|
1024 |
blob->set_neighbour(dir, best_neighbour, best_is_good); |
|
1025 |
return line_trap_count; |
|
1026 |
}
|
|
1027 |
||
1028 |
// Helper to get a list of 1st-order neighbours.
|
|
1029 |
static void ListNeighbours(const BLOBNBOX* blob, |
|
1030 |
BLOBNBOX_CLIST* neighbours) { |
|
1031 |
for (int dir = 0; dir < BND_COUNT; ++dir) { |
|
1032 |
BlobNeighbourDir bnd = static_cast<BlobNeighbourDir>(dir); |
|
1033 |
BLOBNBOX* neighbour = blob->neighbour(bnd); |
|
1034 |
if (neighbour != NULL) { |
|
1035 |
neighbours->add_sorted(SortByBoxLeft<BLOBNBOX>, true, neighbour); |
|
1036 |
}
|
|
1037 |
}
|
|
1038 |
}
|
|
1039 |
||
1040 |
// Helper to get a list of 1st and 2nd order neighbours.
|
|
1041 |
static void List2ndNeighbours(const BLOBNBOX* blob, |
|
1042 |
BLOBNBOX_CLIST* neighbours) { |
|
1043 |
ListNeighbours(blob, neighbours); |
|
1044 |
for (int dir = 0; dir < BND_COUNT; ++dir) { |
|
1045 |
BlobNeighbourDir bnd = static_cast<BlobNeighbourDir>(dir); |
|
1046 |
BLOBNBOX* neighbour = blob->neighbour(bnd); |
|
1047 |
if (neighbour != NULL) { |
|
1048 |
ListNeighbours(neighbour, neighbours); |
|
1049 |
}
|
|
1050 |
}
|
|
1051 |
}
|
|
1052 |
||
1053 |
// Helper to get a list of 1st, 2nd and 3rd order neighbours.
|
|
1054 |
static void List3rdNeighbours(const BLOBNBOX* blob, |
|
1055 |
BLOBNBOX_CLIST* neighbours) { |
|
1056 |
List2ndNeighbours(blob, neighbours); |
|
1057 |
for (int dir = 0; dir < BND_COUNT; ++dir) { |
|
1058 |
BlobNeighbourDir bnd = static_cast<BlobNeighbourDir>(dir); |
|
1059 |
BLOBNBOX* neighbour = blob->neighbour(bnd); |
|
1060 |
if (neighbour != NULL) { |
|
1061 |
List2ndNeighbours(neighbour, neighbours); |
|
1062 |
}
|
|
1063 |
}
|
|
1064 |
}
|
|
1065 |
||
1066 |
// Helper to count the evidence for verticalness or horizontalness
|
|
1067 |
// in a list of neighbours.
|
|
1068 |
static void CountNeighbourGaps(bool debug, BLOBNBOX_CLIST* neighbours, |
|
1069 |
int* pure_h_count, int* pure_v_count) { |
|
1070 |
if (neighbours->length() <= kMostlyOneDirRatio) |
|
1071 |
return; |
|
1072 |
BLOBNBOX_C_IT it(neighbours); |
|
1073 |
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { |
|
1074 |
BLOBNBOX* blob = it.data(); |
|
1075 |
int h_min, h_max, v_min, v_max; |
|
1076 |
blob->MinMaxGapsClipped(&h_min, &h_max, &v_min, &v_max); |
|
1077 |
if (debug) |
|
1078 |
tprintf("Hgaps [%d,%d], vgaps [%d,%d]:", h_min, h_max, v_min, v_max); |
|
1079 |
if (h_max < v_min || |
|
1080 |
blob->leader_on_left() || blob->leader_on_right()) { |
|
1081 |
// Horizontal gaps are clear winners. Count a pure horizontal.
|
|
1082 |
++*pure_h_count; |
|
1083 |
if (debug) tprintf("Horz at:"); |
|
1084 |
} else if (v_max < h_min) { |
|
1085 |
// Vertical gaps are clear winners. Clear a pure vertical.
|
|
1086 |
++*pure_v_count; |
|
1087 |
if (debug) tprintf("Vert at:"); |
|
1088 |
} else { |
|
1089 |
if (debug) tprintf("Neither at:"); |
|
1090 |
}
|
|
1091 |
if (debug) |
|
1092 |
blob->bounding_box().print(); |
|
1093 |
}
|
|
1094 |
}
|
|
1095 |
||
1096 |
// Makes the blob to be only horizontal or vertical where evidence
|
|
1097 |
// is clear based on gaps of 2nd order neighbours, or definite individual
|
|
1098 |
// blobs.
|
|
1099 |
void StrokeWidth::SetNeighbourFlows(BLOBNBOX* blob) { |
|
1100 |
if (blob->DefiniteIndividualFlow()) |
|
1101 |
return; |
|
1102 |
bool debug = AlignedBlob::WithinTestRegion(2, blob->bounding_box().left(), |
|
1103 |
blob->bounding_box().bottom()); |
|
1104 |
if (debug) { |
|
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
1105 |
tprintf("SetNeighbourFlows (current flow=%d, type=%d) on:", |
1106 |
blob->flow(), blob->region_type()); |
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
1107 |
blob->bounding_box().print(); |
1108 |
}
|
|
1109 |
BLOBNBOX_CLIST neighbours; |
|
1110 |
List3rdNeighbours(blob, &neighbours); |
|
1111 |
// The number of pure horizontal and vertical neighbours.
|
|
1112 |
int pure_h_count = 0; |
|
1113 |
int pure_v_count = 0; |
|
1114 |
CountNeighbourGaps(debug, &neighbours, &pure_h_count, &pure_v_count); |
|
1115 |
if (debug) { |
|
1116 |
HandleClick(blob->bounding_box().left() + 1, |
|
1117 |
blob->bounding_box().bottom() + 1); |
|
1118 |
tprintf("SetFlows: h_count=%d, v_count=%d\n", |
|
1119 |
pure_h_count, pure_v_count); |
|
1120 |
}
|
|
1121 |
if (!neighbours.empty()) { |
|
1122 |
blob->set_vert_possible(true); |
|
1123 |
blob->set_horz_possible(true); |
|
1124 |
if (pure_h_count > 2 * pure_v_count) { |
|
1125 |
// Horizontal gaps are clear winners. Clear vertical neighbours.
|
|
1126 |
blob->set_vert_possible(false); |
|
1127 |
} else if (pure_v_count > 2 * pure_h_count) { |
|
1128 |
// Vertical gaps are clear winners. Clear horizontal neighbours.
|
|
1129 |
blob->set_horz_possible(false); |
|
1130 |
}
|
|
1131 |
} else { |
|
1132 |
// Lonely blob. Can't tell its flow direction.
|
|
1133 |
blob->set_vert_possible(false); |
|
1134 |
blob->set_horz_possible(false); |
|
1135 |
}
|
|
1136 |
}
|
|
1137 |
||
1138 |
||
1139 |
// Helper to count the number of horizontal and vertical blobs in a list.
|
|
1140 |
static void CountNeighbourTypes(BLOBNBOX_CLIST* neighbours, |
|
1141 |
int* pure_h_count, int* pure_v_count) { |
|
1142 |
BLOBNBOX_C_IT it(neighbours); |
|
1143 |
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { |
|
1144 |
BLOBNBOX* blob = it.data(); |
|
1145 |
if (blob->UniquelyHorizontal()) |
|
1146 |
++*pure_h_count; |
|
1147 |
if (blob->UniquelyVertical()) |
|
1148 |
++*pure_v_count; |
|
1149 |
}
|
|
1150 |
}
|
|
1151 |
||
1152 |
// Nullify the neighbours in the wrong directions where the direction
|
|
1153 |
// is clear-cut based on a distance margin. Good for isolating vertical
|
|
1154 |
// text from neighbouring horizontal text.
|
|
1155 |
void StrokeWidth::SimplifyObviousNeighbours(BLOBNBOX* blob) { |
|
391
by theraysmith
Removal of NEWDELETE + fix of problem with joined text |
1156 |
// Case 1: We have text that is likely several characters, blurry and joined
|
1157 |
// together.
|
|
1158 |
if ((blob->bounding_box().width() > 3 * blob->area_stroke_width() && |
|
1159 |
blob->bounding_box().height() > 3 * blob->area_stroke_width())) { |
|
1160 |
// The blob is complex (not stick-like).
|
|
1161 |
if (blob->bounding_box().width() > 4 * blob->bounding_box().height()) { |
|
1162 |
// Horizontal conjoined text.
|
|
1163 |
blob->set_neighbour(BND_ABOVE, NULL, false); |
|
1164 |
blob->set_neighbour(BND_BELOW, NULL, false); |
|
1165 |
return; |
|
1166 |
}
|
|
1167 |
if (blob->bounding_box().height() > 4 * blob->bounding_box().width()) { |
|
1168 |
// Vertical conjoined text.
|
|
1169 |
blob->set_neighbour(BND_LEFT, NULL, false); |
|
1170 |
blob->set_neighbour(BND_RIGHT, NULL, false); |
|
1171 |
return; |
|
1172 |
}
|
|
1173 |
}
|
|
1174 |
||
1175 |
// Case 2: This blob is likely a single character.
|
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
1176 |
int margin = gridsize() / 2; |
1177 |
int h_min, h_max, v_min, v_max; |
|
1178 |
blob->MinMaxGapsClipped(&h_min, &h_max, &v_min, &v_max); |
|
1179 |
if ((h_max + margin < v_min && h_max < margin / 2) || |
|
1180 |
blob->leader_on_left() || blob->leader_on_right()) { |
|
1181 |
// Horizontal gaps are clear winners. Clear vertical neighbours.
|
|
1182 |
blob->set_neighbour(BND_ABOVE, NULL, false); |
|
1183 |
blob->set_neighbour(BND_BELOW, NULL, false); |
|
1184 |
} else if (v_max + margin < h_min && v_max < margin / 2) { |
|
1185 |
// Vertical gaps are clear winners. Clear horizontal neighbours.
|
|
1186 |
blob->set_neighbour(BND_LEFT, NULL, false); |
|
1187 |
blob->set_neighbour(BND_RIGHT, NULL, false); |
|
1188 |
}
|
|
1189 |
}
|
|
1190 |
||
1191 |
// Smoothes the vertical/horizontal type of the blob based on the
|
|
1192 |
// 2nd-order neighbours. If reset_all is true, then all blobs are
|
|
1193 |
// changed. Otherwise, only ambiguous blobs are processed.
|
|
1194 |
void StrokeWidth::SmoothNeighbourTypes(BLOBNBOX* blob, bool reset_all) { |
|
1195 |
if ((blob->vert_possible() && blob->horz_possible()) || reset_all) { |
|
1196 |
// There are both horizontal and vertical so try to fix it.
|
|
1197 |
BLOBNBOX_CLIST neighbours; |
|
1198 |
List2ndNeighbours(blob, &neighbours); |
|
1199 |
// The number of pure horizontal and vertical neighbours.
|
|
1200 |
int pure_h_count = 0; |
|
1201 |
int pure_v_count = 0; |
|
1202 |
CountNeighbourTypes(&neighbours, &pure_h_count, &pure_v_count); |
|
1203 |
if (AlignedBlob::WithinTestRegion(2, blob->bounding_box().left(), |
|
1204 |
blob->bounding_box().bottom())) { |
|
1205 |
HandleClick(blob->bounding_box().left() + 1, |
|
1206 |
blob->bounding_box().bottom() + 1); |
|
1207 |
tprintf("pure_h=%d, pure_v=%d\n", |
|
1208 |
pure_h_count, pure_v_count); |
|
1209 |
}
|
|
1210 |
if (pure_h_count > pure_v_count) { |
|
1211 |
// Horizontal gaps are clear winners. Clear vertical neighbours.
|
|
1212 |
blob->set_vert_possible(false); |
|
1213 |
blob->set_horz_possible(true); |
|
1214 |
} else if (pure_v_count > pure_h_count) { |
|
1215 |
// Vertical gaps are clear winners. Clear horizontal neighbours.
|
|
1216 |
blob->set_horz_possible(false); |
|
1217 |
blob->set_vert_possible(true); |
|
1218 |
}
|
|
1219 |
} else if (AlignedBlob::WithinTestRegion(2, blob->bounding_box().left(), |
|
1220 |
blob->bounding_box().bottom())) { |
|
1221 |
HandleClick(blob->bounding_box().left() + 1, |
|
1222 |
blob->bounding_box().bottom() + 1); |
|
1223 |
tprintf("Clean on pass 3!\n"); |
|
1224 |
}
|
|
1225 |
}
|
|
1226 |
||
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
1227 |
// Partition creation. Accumulates vertical and horizontal text chains,
|
1228 |
// puts the remaining blobs in as unknowns, and then merges/splits to
|
|
1229 |
// minimize overlap and smoothes the types with neighbours and the color
|
|
1230 |
// image if provided. rerotation is used to rotate the coordinate space
|
|
1231 |
// back to the nontext_map_ image.
|
|
1232 |
void StrokeWidth::FindInitialPartitions(const FCOORD& rerotation, |
|
1233 |
TO_BLOCK* block, |
|
1234 |
ColPartitionGrid* part_grid, |
|
1235 |
ColPartition_LIST* big_parts) { |
|
1236 |
FindVerticalTextChains(part_grid); |
|
1237 |
FindHorizontalTextChains(part_grid); |
|
1238 |
if (textord_tabfind_show_strokewidths) { |
|
1239 |
chains_win_ = MakeWindow(0, 400, "Initial text chains"); |
|
1240 |
part_grid->DisplayBoxes(chains_win_); |
|
1241 |
projection_->DisplayProjection(); |
|
1242 |
}
|
|
1243 |
part_grid->SplitOverlappingPartitions(big_parts); |
|
1244 |
EasyMerges(part_grid); |
|
1245 |
RemoveLargeUnusedBlobs(block, part_grid, big_parts); |
|
1246 |
TBOX grid_box(bleft(), tright()); |
|
1247 |
while (part_grid->GridSmoothNeighbours(BTFT_CHAIN, nontext_map_, grid_box, |
|
1248 |
rerotation)); |
|
1249 |
while (part_grid->GridSmoothNeighbours(BTFT_NEIGHBOURS, nontext_map_, |
|
1250 |
grid_box, rerotation)); |
|
1251 |
TestDiacritics(part_grid, block); |
|
1252 |
MergeDiacritics(block, part_grid); |
|
1253 |
if (textord_tabfind_show_strokewidths) { |
|
1254 |
textlines_win_ = MakeWindow(400, 400, "GoodTextline blobs"); |
|
1255 |
part_grid->DisplayBoxes(textlines_win_); |
|
1256 |
diacritics_win_ = DisplayDiacritics("Diacritics", 0, 0, block); |
|
1257 |
}
|
|
1258 |
PartitionRemainingBlobs(part_grid); |
|
1259 |
part_grid->SplitOverlappingPartitions(big_parts); |
|
1260 |
EasyMerges(part_grid); |
|
1261 |
while (part_grid->GridSmoothNeighbours(BTFT_CHAIN, nontext_map_, grid_box, |
|
1262 |
rerotation)); |
|
1263 |
while (part_grid->GridSmoothNeighbours(BTFT_NEIGHBOURS, nontext_map_, |
|
1264 |
grid_box, rerotation)); |
|
1265 |
// Now eliminate strong stuff in a sea of the opposite.
|
|
1266 |
while (part_grid->GridSmoothNeighbours(BTFT_STRONG_CHAIN, nontext_map_, |
|
1267 |
grid_box, rerotation)); |
|
1268 |
if (textord_tabfind_show_strokewidths) { |
|
1269 |
smoothed_win_ = MakeWindow(800, 400, "Smoothed blobs"); |
|
1270 |
part_grid->DisplayBoxes(smoothed_win_); |
|
1271 |
}
|
|
1272 |
}
|
|
1273 |
||
1274 |
// Helper verifies that blob's neighbour in direction dir is good to add to a
|
|
1275 |
// vertical text chain by returning the neighbour if it is not null, not owned,
|
|
1276 |
// and not uniquely horizontal, as well as its neighbour in the opposite
|
|
1277 |
// direction is blob.
|
|
1278 |
static BLOBNBOX* MutualUnusedVNeighbour(const BLOBNBOX* blob, |
|
1279 |
BlobNeighbourDir dir) { |
|
1280 |
BLOBNBOX* next_blob = blob->neighbour(dir); |
|
1281 |
if (next_blob == NULL || next_blob->owner() != NULL || |
|
1282 |
next_blob->UniquelyHorizontal()) |
|
1283 |
return NULL; |
|
1284 |
if (next_blob->neighbour(DirOtherWay(dir)) == blob) |
|
1285 |
return next_blob; |
|
1286 |
return NULL; |
|
1287 |
}
|
|
1288 |
||
1289 |
// Finds vertical chains of text-like blobs and puts them in ColPartitions.
|
|
1290 |
void StrokeWidth::FindVerticalTextChains(ColPartitionGrid* part_grid) { |
|
1291 |
BlobGridSearch gsearch(this); |
|
1292 |
BLOBNBOX* bbox; |
|
1293 |
gsearch.StartFullSearch(); |
|
1294 |
while ((bbox = gsearch.NextFullSearch()) != NULL) { |
|
1295 |
// Only process boxes that have no horizontal hope and have not yet
|
|
1296 |
// been included in a chain.
|
|
1297 |
BLOBNBOX* blob; |
|
1298 |
if (bbox->owner() == NULL && bbox->UniquelyVertical() && |
|
1299 |
(blob = MutualUnusedVNeighbour(bbox, BND_ABOVE)) != NULL) { |
|
1300 |
// Put all the linked blobs into a ColPartition.
|
|
1301 |
ColPartition* part = new ColPartition(BRT_VERT_TEXT, ICOORD(0, 1)); |
|
1302 |
part->AddBox(bbox); |
|
1303 |
while (blob != NULL) { |
|
1304 |
part->AddBox(blob); |
|
1305 |
blob = MutualUnusedVNeighbour(blob, BND_ABOVE); |
|
1306 |
}
|
|
1307 |
blob = MutualUnusedVNeighbour(bbox, BND_BELOW); |
|
1308 |
while (blob != NULL) { |
|
1309 |
part->AddBox(blob); |
|
1310 |
blob = MutualUnusedVNeighbour(blob, BND_BELOW); |
|
1311 |
}
|
|
1312 |
CompletePartition(part, part_grid); |
|
1313 |
}
|
|
1314 |
}
|
|
1315 |
}
|
|
1316 |
||
1317 |
// Helper verifies that blob's neighbour in direction dir is good to add to a
|
|
1318 |
// horizontal text chain by returning the neighbour if it is not null, not
|
|
1319 |
// owned, and not uniquely vertical, as well as its neighbour in the opposite
|
|
1320 |
// direction is blob.
|
|
1321 |
static BLOBNBOX* MutualUnusedHNeighbour(const BLOBNBOX* blob, |
|
1322 |
BlobNeighbourDir dir) { |
|
1323 |
BLOBNBOX* next_blob = blob->neighbour(dir); |
|
1324 |
if (next_blob == NULL || next_blob->owner() != NULL || |
|
1325 |
next_blob->UniquelyVertical()) |
|
1326 |
return NULL; |
|
1327 |
if (next_blob->neighbour(DirOtherWay(dir)) == blob) |
|
1328 |
return next_blob; |
|
1329 |
return NULL; |
|
1330 |
}
|
|
1331 |
||
1332 |
// Finds horizontal chains of text-like blobs and puts them in ColPartitions.
|
|
1333 |
void StrokeWidth::FindHorizontalTextChains(ColPartitionGrid* part_grid) { |
|
1334 |
BlobGridSearch gsearch(this); |
|
1335 |
BLOBNBOX* bbox; |
|
1336 |
gsearch.StartFullSearch(); |
|
1337 |
while ((bbox = gsearch.NextFullSearch()) != NULL) { |
|
1338 |
BLOBNBOX* blob; |
|
1339 |
if (bbox->owner() == NULL && bbox->UniquelyHorizontal() && |
|
1340 |
(blob = MutualUnusedHNeighbour(bbox, BND_RIGHT)) != NULL) { |
|
1341 |
// Put all the linked blobs into a ColPartition.
|
|
1342 |
ColPartition* part = new ColPartition(BRT_TEXT, ICOORD(0, 1)); |
|
1343 |
part->AddBox(bbox); |
|
1344 |
while (blob != NULL) { |
|
1345 |
part->AddBox(blob); |
|
1346 |
blob = MutualUnusedHNeighbour(blob, BND_RIGHT); |
|
1347 |
}
|
|
1348 |
blob = MutualUnusedHNeighbour(bbox, BND_LEFT); |
|
1349 |
while (blob != NULL) { |
|
1350 |
part->AddBox(blob); |
|
1351 |
blob = MutualUnusedVNeighbour(blob, BND_LEFT); |
|
1352 |
}
|
|
1353 |
CompletePartition(part, part_grid); |
|
1354 |
}
|
|
1355 |
}
|
|
1356 |
}
|
|
1357 |
||
1358 |
// Finds diacritics and saves their base character in the blob.
|
|
1359 |
// The objective is to move all diacritics to the noise_blobs list, so
|
|
1360 |
// they don't mess up early textline finding/merging, or force splits
|
|
1361 |
// on textlines that overlap a bit. Blobs that become diacritics must be
|
|
1362 |
// either part of no ColPartition (NULL owner) or in a small partition in
|
|
1363 |
// which ALL the blobs are diacritics, in which case the partition is
|
|
1364 |
// exploded (deleted) back to its blobs.
|
|
1365 |
void StrokeWidth::TestDiacritics(ColPartitionGrid* part_grid, TO_BLOCK* block) { |
|
1366 |
BlobGrid small_grid(gridsize(), bleft(), tright()); |
|
1367 |
small_grid.InsertBlobList(&block->noise_blobs); |
|
1368 |
small_grid.InsertBlobList(&block->blobs); |
|
1369 |
int medium_diacritics = 0; |
|
1370 |
int small_diacritics = 0; |
|
1371 |
BLOBNBOX_IT small_it(&block->noise_blobs); |
|
1372 |
for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) { |
|
1373 |
BLOBNBOX* blob = small_it.data(); |
|
1374 |
if (blob->owner() == NULL && !blob->IsDiacritic() && |
|
1375 |
DiacriticBlob(&small_grid, blob)) { |
|
1376 |
++small_diacritics; |
|
1377 |
}
|
|
1378 |
}
|
|
1379 |
BLOBNBOX_IT blob_it(&block->blobs); |
|
1380 |
for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { |
|
1381 |
BLOBNBOX* blob = blob_it.data(); |
|
1382 |
if (blob->IsDiacritic()) { |
|
1383 |
small_it.add_to_end(blob_it.extract()); |
|
1384 |
continue; // Already a diacritic. |
|
1385 |
}
|
|
1386 |
ColPartition* part = blob->owner(); |
|
1387 |
if (part == NULL && DiacriticBlob(&small_grid, blob)) { |
|
1388 |
++medium_diacritics; |
|
1389 |
RemoveBBox(blob); |
|
1390 |
small_it.add_to_end(blob_it.extract()); |
|
1391 |
} else if (part != NULL && !part->block_owned() && |
|
1392 |
part->boxes_count() < 3) { |
|
1393 |
// We allow blobs in small partitions to become diacritics if ALL the
|
|
1394 |
// blobs in the partition qualify as we can then cleanly delete the
|
|
1395 |
// partition, turn all the blobs in it to diacritics and they can be
|
|
1396 |
// merged into the base character partition more easily than merging
|
|
1397 |
// the partitions.
|
|
1398 |
BLOBNBOX_C_IT box_it(part->boxes()); |
|
1399 |
for (box_it.mark_cycle_pt(); !box_it.cycled_list() && |
|
1400 |
DiacriticBlob(&small_grid, box_it.data()); |
|
1401 |
box_it.forward()); |
|
1402 |
if (box_it.cycled_list()) { |
|
1403 |
// They are all good.
|
|
1404 |
while (!box_it.empty()) { |
|
1405 |
// Liberate the blob from its partition so it can be treated
|
|
1406 |
// as a diacritic and merged explicitly with the base part.
|
|
1407 |
// The blob is really owned by the block. The partition "owner"
|
|
1408 |
// is NULLed to allow the blob to get merged with its base character
|
|
1409 |
// partition.
|
|
1410 |
BLOBNBOX* box = box_it.extract(); |
|
1411 |
box->set_owner(NULL); |
|
1412 |
box_it.forward(); |
|
1413 |
++medium_diacritics; |
|
1414 |
// We remove the blob from the grid so it isn't found by subsequent
|
|
1415 |
// searches where we might not want to include diacritics.
|
|
1416 |
RemoveBBox(box); |
|
1417 |
}
|
|
1418 |
// We only move the one blob to the small list here, but the others
|
|
1419 |
// all get moved by the test at the top of the loop.
|
|
1420 |
small_it.add_to_end(blob_it.extract()); |
|
1421 |
part_grid->RemoveBBox(part); |
|
1422 |
delete part; |
|
1423 |
}
|
|
1424 |
} else if (AlignedBlob::WithinTestRegion(2, blob->bounding_box().left(), |
|
1425 |
blob->bounding_box().bottom())) { |
|
1426 |
tprintf("Blob not available to be a diacritic at:"); |
|
1427 |
blob->bounding_box().print(); |
|
1428 |
}
|
|
1429 |
}
|
|
1430 |
if (textord_tabfind_show_strokewidths) { |
|
1431 |
tprintf("Found %d small diacritics, %d medium\n", |
|
1432 |
small_diacritics, medium_diacritics); |
|
1433 |
}
|
|
1434 |
}
|
|
1435 |
||
1436 |
// Searches this grid for an appropriately close and sized neighbour of the
|
|
1437 |
// given [small] blob. If such a blob is found, the diacritic base is saved
|
|
1438 |
// in the blob and true is returned.
|
|
1439 |
// The small_grid is a secondary grid that contains the small/noise objects
|
|
1440 |
// that are not in this grid, but may be useful for determining a connection
|
|
1441 |
// between blob and its potential base character. (See DiacriticXGapFilled.)
|
|
1442 |
bool StrokeWidth::DiacriticBlob(BlobGrid* small_grid, BLOBNBOX* blob) { |
|
1443 |
if (BLOBNBOX::UnMergeableType(blob->region_type()) || |
|
1444 |
blob->region_type() == BRT_VERT_TEXT) |
|
1445 |
return false; |
|
1446 |
TBOX small_box(blob->bounding_box()); |
|
1447 |
bool debug = AlignedBlob::WithinTestRegion(2, small_box.left(), |
|
1448 |
small_box.bottom()); |
|
1449 |
if (debug) { |
|
1450 |
tprintf("Testing blob for diacriticness at:"); |
|
1451 |
small_box.print(); |
|
1452 |
}
|
|
1453 |
int x = (small_box.left() + small_box.right()) / 2; |
|
1454 |
int y = (small_box.bottom() + small_box.top()) / 2; |
|
1455 |
int grid_x, grid_y; |
|
1456 |
GridCoords(x, y, &grid_x, &grid_y); |
|
1457 |
int height = small_box.height(); |
|
1458 |
// Setup a rectangle search to find its nearest base-character neighbour.
|
|
1459 |
// We keep 2 different best candidates:
|
|
1460 |
// best_x_overlap is a category of base characters that have an overlap in x
|
|
1461 |
// (like a acute) in which we look for the least y-gap, computed using the
|
|
1462 |
// projection to favor base characters in the same textline.
|
|
1463 |
// best_y_overlap is a category of base characters that have no x overlap,
|
|
1464 |
// (nominally a y-overlap is preferrecd but not essential) in which we
|
|
1465 |
// look for the least weighted sum of x-gap and y-gap, with x-gap getting
|
|
1466 |
// a lower weight to catch quotes at the end of a textline.
|
|
1467 |
// NOTE that x-gap and y-gap are measured from the nearest side of the base
|
|
1468 |
// character to the FARTHEST side of the diacritic to allow small diacritics
|
|
1469 |
// to be a reasonable distance away, but not big diacritics.
|
|
1470 |
BLOBNBOX* best_x_overlap = NULL; |
|
1471 |
BLOBNBOX* best_y_overlap = NULL; |
|
1472 |
int best_total_dist = 0; |
|
1473 |
int best_y_gap = 0; |
|
1474 |
TBOX best_xbox; |
|
1475 |
// TODO(rays) the search box could be setup using the projection as a guide.
|
|
1476 |
TBOX search_box(small_box); |
|
1477 |
int x_pad = IntCastRounded(gridsize() * kDiacriticXPadRatio); |
|
1478 |
int y_pad = IntCastRounded(gridsize() * kDiacriticYPadRatio); |
|
1479 |
search_box.pad(x_pad, y_pad); |
|
1480 |
BlobGridSearch rsearch(this); |
|
1481 |
rsearch.SetUniqueMode(true); |
|
1482 |
int min_height = height * kMinDiacriticSizeRatio; |
|
1483 |
rsearch.StartRectSearch(search_box); |
|
1484 |
BLOBNBOX* neighbour; |
|
1485 |
while ((neighbour = rsearch.NextRectSearch()) != NULL) { |
|
1486 |
if (BLOBNBOX::UnMergeableType(neighbour->region_type()) || |
|
1487 |
neighbour == blob || neighbour->owner() == blob->owner()) |
|
1488 |
continue; |
|
1489 |
TBOX nbox = neighbour->bounding_box(); |
|
1490 |
if (neighbour->owner() == NULL || neighbour->owner()->IsVerticalType() || |
|
1491 |
(neighbour->flow() != BTFT_CHAIN && |
|
1492 |
neighbour->flow() != BTFT_STRONG_CHAIN)) { |
|
1493 |
if (debug) { |
|
1494 |
tprintf("Neighbour not strong enough:"); |
|
1495 |
nbox.print(); |
|
1496 |
}
|
|
1497 |
continue; // Diacritics must be attached to strong text. |
|
1498 |
}
|
|
1499 |
if (nbox.height() < min_height) { |
|
1500 |
if (debug) { |
|
1501 |
tprintf("Neighbour not big enough:"); |
|
1502 |
nbox.print(); |
|
1503 |
}
|
|
1504 |
continue; // Too small to be the base character. |
|
1505 |
}
|
|
1506 |
int x_gap = small_box.x_gap(nbox); |
|
1507 |
int y_gap = small_box.y_gap(nbox); |
|
1508 |
int total_distance = projection_->DistanceOfBoxFromBox(small_box, nbox, |
|
1509 |
true, denorm_, |
|
1510 |
debug); |
|
1511 |
if (debug) tprintf("xgap=%d, y=%d, total dist=%d\n", |
|
1512 |
x_gap, y_gap, total_distance); |
|
1513 |
if (total_distance > |
|
1514 |
neighbour->owner()->median_size() * kMaxDiacriticDistanceRatio) { |
|
1515 |
if (debug) { |
|
1516 |
tprintf("Neighbour with median size %d too far away:", |
|
1517 |
neighbour->owner()->median_size()); |
|
1518 |
neighbour->bounding_box().print(); |
|
1519 |
}
|
|
1520 |
continue; // Diacritics must not be too distant. |
|
1521 |
}
|
|
1522 |
if (x_gap <= 0) { |
|
1523 |
if (debug) { |
|
1524 |
tprintf("Computing reduced box for :"); |
|
1525 |
nbox.print(); |
|
1526 |
}
|
|
1527 |
int left = small_box.left() - small_box.width(); |
|
1528 |
int right = small_box.right() + small_box.width(); |
|
1529 |
nbox = neighbour->BoundsWithinLimits(left, right); |
|
1530 |
y_gap = small_box.y_gap(nbox); |
|
1531 |
if (best_x_overlap == NULL || y_gap < best_y_gap) { |
|
1532 |
best_x_overlap = neighbour; |
|
1533 |
best_xbox = nbox; |
|
1534 |
best_y_gap = y_gap; |
|
1535 |
if (debug) { |
|
1536 |
tprintf("New best:"); |
|
1537 |
nbox.print(); |
|
1538 |
}
|
|
1539 |
} else if (debug) { |
|
1540 |
tprintf("Shrunken box doesn't win:"); |
|
1541 |
nbox.print(); |
|
1542 |
}
|
|
1543 |
} else if (blob->ConfirmNoTabViolation(*neighbour)) { |
|
1544 |
if (best_y_overlap == NULL || total_distance < best_total_dist) { |
|
1545 |
if (debug) { |
|
1546 |
tprintf("New best y overlap:"); |
|
1547 |
nbox.print(); |
|
1548 |
}
|
|
1549 |
best_y_overlap = neighbour; |
|
1550 |
best_total_dist = total_distance; |
|
1551 |
} else if (debug) { |
|
1552 |
tprintf("New y overlap box doesn't win:"); |
|
1553 |
nbox.print(); |
|
1554 |
}
|
|
1555 |
} else if (debug) { |
|
1556 |
tprintf("Neighbour wrong side of a tab:"); |
|
1557 |
nbox.print(); |
|
1558 |
}
|
|
1559 |
}
|
|
1560 |
if (best_x_overlap != NULL && |
|
1561 |
(best_y_overlap == NULL || |
|
1562 |
best_xbox.major_y_overlap(best_y_overlap->bounding_box()))) { |
|
1563 |
blob->set_diacritic_box(best_xbox); |
|
1564 |
blob->set_base_char_blob(best_x_overlap); |
|
1565 |
if (debug) { |
|
1566 |
tprintf("DiacriticBlob OK! (x-overlap:"); |
|
1567 |
small_box.print(); |
|
1568 |
best_xbox.print(); |
|
1569 |
}
|
|
1570 |
return true; |
|
1571 |
}
|
|
1572 |
if (best_y_overlap != NULL && |
|
1573 |
DiacriticXGapFilled(small_grid, small_box, |
|
1574 |
best_y_overlap->bounding_box()) && |
|
1575 |
NoNoiseInBetween(small_box, best_y_overlap->bounding_box())) { |
|
1576 |
blob->set_diacritic_box(best_y_overlap->bounding_box()); |
|
1577 |
blob->set_base_char_blob(best_y_overlap); |
|
1578 |
if (debug) { |
|
1579 |
tprintf("DiacriticBlob OK! (y-overlap:"); |
|
1580 |
small_box.print(); |
|
1581 |
best_y_overlap->bounding_box().print(); |
|
1582 |
}
|
|
1583 |
return true; |
|
1584 |
}
|
|
1585 |
if (debug) { |
|
1586 |
tprintf("DiacriticBlob fails:"); |
|
1587 |
small_box.print(); |
|
1588 |
tprintf("Best x+y gap = %d, y = %d\n", best_total_dist, best_y_gap); |
|
1589 |
if (best_y_overlap != NULL) { |
|
1590 |
tprintf("XGapFilled=%d, NoiseBetween=%d\n", |
|
1591 |
DiacriticXGapFilled(small_grid, small_box, |
|
1592 |
best_y_overlap->bounding_box()), |
|
1593 |
NoNoiseInBetween(small_box, best_y_overlap->bounding_box())); |
|
1594 |
}
|
|
1595 |
}
|
|
1596 |
return false; |
|
1597 |
}
|
|
1598 |
||
1599 |
// Returns true if there is no gap between the base char and the diacritic
|
|
1600 |
// bigger than a fraction of the height of the base char:
|
|
1601 |
// Eg: line end.....'
|
|
1602 |
// The quote is a long way from the end of the line, yet it needs to be a
|
|
1603 |
// diacritic. To determine that the quote is not part of an image, or
|
|
1604 |
// a different text block, we check for other marks in the gap between
|
|
1605 |
// the base char and the diacritic.
|
|
1606 |
// '<--Diacritic
|
|
1607 |
// |---------|
|
|
1608 |
// | |<-toobig-gap->
|
|
1609 |
// | Base |<ok gap>
|
|
1610 |
// |---------| x<-----Dot occupying gap
|
|
1611 |
// The grid is const really.
|
|
1612 |
bool StrokeWidth::DiacriticXGapFilled(BlobGrid* grid, |
|
1613 |
const TBOX& diacritic_box, |
|
1614 |
const TBOX& base_box) { |
|
1615 |
// Since most gaps are small, use an iterative algorithm to search the gap.
|
|
1616 |
int max_gap = IntCastRounded(base_box.height() * |
|
1617 |
kMaxDiacriticGapToBaseCharHeight); |
|
1618 |
TBOX occupied_box(base_box); |
|
1619 |
int diacritic_gap; |
|
1620 |
while ((diacritic_gap = diacritic_box.x_gap(occupied_box)) > max_gap) { |
|
1621 |
TBOX search_box(occupied_box); |
|
1622 |
if (diacritic_box.left() > search_box.right()) { |
|
1623 |
// We are looking right.
|
|
1624 |
search_box.set_left(search_box.right()); |
|
1625 |
search_box.set_right(search_box.left() + max_gap); |
|
1626 |
} else { |
|
1627 |
// We are looking left.
|
|
1628 |
search_box.set_right(search_box.left()); |
|
1629 |
search_box.set_left(search_box.left() - max_gap); |
|
1630 |
}
|
|
1631 |
BlobGridSearch rsearch(grid); |
|
1632 |
rsearch.StartRectSearch(search_box); |
|
1633 |
BLOBNBOX* neighbour; |
|
1634 |
while ((neighbour = rsearch.NextRectSearch()) != NULL) { |
|
1635 |
const TBOX& nbox = neighbour->bounding_box(); |
|
1636 |
if (nbox.x_gap(diacritic_box) < diacritic_gap) { |
|
1637 |
if (nbox.left() < occupied_box.left()) |
|
1638 |
occupied_box.set_left(nbox.left()); |
|
1639 |
if (nbox.right() > occupied_box.right()) |
|
1640 |
occupied_box.set_right(nbox.right()); |
|
1641 |
break; |
|
1642 |
}
|
|
1643 |
}
|
|
1644 |
if (neighbour == NULL) |
|
1645 |
return false; // Found a big gap. |
|
1646 |
}
|
|
1647 |
return true; // The gap was filled. |
|
1648 |
}
|
|
1649 |
||
1650 |
// Merges diacritics with the ColPartition of the base character blob.
|
|
1651 |
void StrokeWidth::MergeDiacritics(TO_BLOCK* block, |
|
1652 |
ColPartitionGrid* part_grid) { |
|
1653 |
BLOBNBOX_IT small_it(&block->noise_blobs); |
|
1654 |
for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) { |
|
1655 |
BLOBNBOX* blob = small_it.data(); |
|
1656 |
if (blob->base_char_blob() != NULL) { |
|
1657 |
ColPartition* part = blob->base_char_blob()->owner(); |
|
1658 |
// The base character must be owned by a partition and that partition
|
|
1659 |
// must not be on the big_parts list (not block owned).
|
|
1660 |
if (part != NULL && !part->block_owned() && blob->owner() == NULL && |
|
1661 |
blob->IsDiacritic()) { |
|
1662 |
// The partition has to be removed from the grid and reinserted
|
|
1663 |
// because its bounding box may change.
|
|
1664 |
part_grid->RemoveBBox(part); |
|
1665 |
part->AddBox(blob); |
|
1666 |
blob->set_region_type(part->blob_type()); |
|
1667 |
blob->set_flow(part->flow()); |
|
1668 |
blob->set_owner(part); |
|
1669 |
part_grid->InsertBBox(true, true, part); |
|
1670 |
}
|
|
1671 |
// Set all base chars to NULL before any blobs get deleted.
|
|
1672 |
blob->set_base_char_blob(NULL); |
|
1673 |
}
|
|
1674 |
}
|
|
1675 |
}
|
|
1676 |
||
1677 |
// Any blobs on the large_blobs list of block that are still unowned by a
|
|
1678 |
// ColPartition, are probably drop-cap or vertically touching so the blobs
|
|
1679 |
// are removed to the big_parts list and treated separately.
|
|
1680 |
void StrokeWidth::RemoveLargeUnusedBlobs(TO_BLOCK* block, |
|
1681 |
ColPartitionGrid* part_grid, |
|
1682 |
ColPartition_LIST* big_parts) { |
|
1683 |
BLOBNBOX_IT large_it(&block->large_blobs); |
|
199
by theraysmith
Changes to textord for 3.00 |
1684 |
for (large_it.mark_cycle_pt(); !large_it.cycled_list(); large_it.forward()) { |
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
1685 |
BLOBNBOX* blob = large_it.data(); |
1686 |
ColPartition* big_part = blob->owner(); |
|
1687 |
if (big_part == NULL) { |
|
1688 |
// Large blobs should have gone into partitions by now if they are
|
|
1689 |
// genuine characters, so move any unowned ones out to the big parts
|
|
1690 |
// list. This will include drop caps and vertically touching characters.
|
|
1691 |
ColPartition::MakeBigPartition(blob, big_parts); |
|
1692 |
}
|
|
1693 |
}
|
|
1694 |
}
|
|
1695 |
||
1696 |
// All remaining unused blobs are put in individual ColPartitions.
|
|
1697 |
void StrokeWidth::PartitionRemainingBlobs(ColPartitionGrid* part_grid) { |
|
1698 |
BlobGridSearch gsearch(this); |
|
1699 |
BLOBNBOX* bbox; |
|
1700 |
int prev_grid_x = -1; |
|
1701 |
int prev_grid_y = -1; |
|
1702 |
BLOBNBOX_CLIST cell_list; |
|
1703 |
BLOBNBOX_C_IT cell_it(&cell_list); |
|
1704 |
bool cell_all_noise = true; |
|
1705 |
gsearch.StartFullSearch(); |
|
1706 |
while ((bbox = gsearch.NextFullSearch()) != NULL) { |
|
1707 |
int grid_x = gsearch.GridX(); |
|
1708 |
int grid_y = gsearch.GridY(); |
|
1709 |
if (grid_x != prev_grid_x || grid_y != prev_grid_y) { |
|
1710 |
// New cell. Process old cell.
|
|
1711 |
MakePartitionsFromCellList(cell_all_noise, part_grid, &cell_list); |
|
1712 |
cell_it.set_to_list(&cell_list); |
|
1713 |
prev_grid_x = grid_x; |
|
1714 |
prev_grid_y = grid_y; |
|
1715 |
cell_all_noise = true; |
|
1716 |
}
|
|
1717 |
if (bbox->owner() == NULL) { |
|
1718 |
cell_it.add_to_end(bbox); |
|
1719 |
if (bbox->flow() != BTFT_NONTEXT) |
|
1720 |
cell_all_noise = false; |
|
1721 |
} else { |
|
1722 |
cell_all_noise = false; |
|
1723 |
}
|
|
1724 |
}
|
|
1725 |
MakePartitionsFromCellList(cell_all_noise, part_grid, &cell_list); |
|
1726 |
}
|
|
1727 |
||
1728 |
// If combine, put all blobs in the cell_list into a single partition, otherwise
|
|
1729 |
// put each one into its own partition.
|
|
1730 |
void StrokeWidth::MakePartitionsFromCellList(bool combine, |
|
1731 |
ColPartitionGrid* part_grid, |
|
1732 |
BLOBNBOX_CLIST* cell_list) { |
|
1733 |
if (cell_list->empty()) |
|
1734 |
return; |
|
1735 |
BLOBNBOX_C_IT cell_it(cell_list); |
|
1736 |
if (combine) { |
|
1737 |
BLOBNBOX* bbox = cell_it.extract(); |
|
1738 |
ColPartition* part = new ColPartition(bbox->region_type(), ICOORD(0, 1)); |
|
1739 |
part->AddBox(bbox); |
|
1740 |
part->set_flow(bbox->flow()); |
|
1741 |
for (cell_it.forward(); !cell_it.empty(); cell_it.forward()) { |
|
1742 |
part->AddBox(cell_it.extract()); |
|
1743 |
}
|
|
1744 |
CompletePartition(part, part_grid); |
|
1745 |
} else { |
|
1746 |
for (; !cell_it.empty(); cell_it.forward()) { |
|
1747 |
BLOBNBOX* bbox = cell_it.extract(); |
|
1748 |
ColPartition* part = new ColPartition(bbox->region_type(), ICOORD(0, 1)); |
|
1749 |
part->set_flow(bbox->flow()); |
|
1750 |
part->AddBox(bbox); |
|
1751 |
CompletePartition(part, part_grid); |
|
1752 |
}
|
|
1753 |
}
|
|
1754 |
}
|
|
1755 |
||
1756 |
// Helper function to finish setting up a ColPartition and insert into
|
|
1757 |
// part_grid.
|
|
1758 |
void StrokeWidth::CompletePartition(ColPartition* part, |
|
1759 |
ColPartitionGrid* part_grid) { |
|
1760 |
part->ComputeLimits(); |
|
1761 |
TBOX box = part->bounding_box(); |
|
1762 |
bool debug = AlignedBlob::WithinTestRegion(2, box.left(), |
|
1763 |
box.bottom()); |
|
1764 |
int value = projection_->EvaluateColPartition(*part, denorm_, debug); |
|
1765 |
part->SetRegionAndFlowTypesFromProjectionValue(value); |
|
1766 |
part->ClaimBoxes(); |
|
1767 |
part_grid->InsertBBox(true, true, part); |
|
1768 |
}
|
|
1769 |
||
1770 |
// Merge partitions where the merge appears harmless.
// Delegates to part_grid->Merges, supplying two callbacks bound to this
// object: OrientationSearchBox to compute the search box for a partition,
// and ConfirmEasyMerge to approve or reject a candidate pair.
// NOTE(review): the callbacks are created with NewPermanentTessCallback and
// are not deleted here; presumably Merges takes ownership -- confirm.
void StrokeWidth::EasyMerges(ColPartitionGrid* part_grid) {
  part_grid->Merges(
      NewPermanentTessCallback(this, &StrokeWidth::OrientationSearchBox),
      NewPermanentTessCallback(this, &StrokeWidth::ConfirmEasyMerge));
}
|
|
1777 |
||
1778 |
// Compute a search box based on the orientation of the partition.
|
|
1779 |
// Returns true if a suitable box can be calculated.
|
|
1780 |
// Callback for EasyMerges.
|
|
1781 |
bool StrokeWidth::OrientationSearchBox(ColPartition* part, TBOX* box) { |
|
1782 |
if (part->IsVerticalType()) { |
|
1783 |
box->set_top(box->top() + box->width()); |
|
1784 |
box->set_bottom(box->bottom() - box->width()); |
|
1785 |
} else { |
|
1786 |
box->set_left(box->left() - box->height()); |
|
1787 |
box->set_right(box->right() + box->height()); |
|
1788 |
}
|
|
1789 |
return true; |
|
1790 |
}
|
|
1791 |
||
1792 |
// Merge confirmation callback for EasyMerges.
|
|
1793 |
bool StrokeWidth::ConfirmEasyMerge(const ColPartition* p1, |
|
1794 |
const ColPartition* p2) { |
|
1795 |
ASSERT_HOST(p1 != NULL && p2 != NULL); |
|
1796 |
ASSERT_HOST(!p1->IsEmpty() && !p2->IsEmpty()); |
|
1797 |
if ((p1->flow() == BTFT_NONTEXT && p2->flow() >= BTFT_CHAIN) || |
|
1798 |
(p1->flow() >= BTFT_CHAIN && p2->flow() == BTFT_NONTEXT)) |
|
1799 |
return false; // Don't merge confirmed image with text. |
|
1800 |
if ((p1->IsVerticalType() || p2->IsVerticalType()) && |
|
1801 |
p1->HCoreOverlap(*p2) <= 0 && |
|
1802 |
((!p1->IsSingleton() && |
|
1803 |
!p2->IsSingleton()) || |
|
1804 |
!p1->bounding_box().major_overlap(p2->bounding_box()))) |
|
1805 |
return false; // Overlap must be in the text line. |
|
1806 |
if ((p1->IsHorizontalType() || p2->IsHorizontalType()) && |
|
1807 |
p1->VCoreOverlap(*p2) <= 0 && |
|
1808 |
((!p1->IsSingleton() && |
|
1809 |
!p2->IsSingleton()) || |
|
1810 |
(!p1->bounding_box().major_overlap(p2->bounding_box()) && |
|
1811 |
!p1->OKDiacriticMerge(*p2, false) && |
|
1812 |
!p2->OKDiacriticMerge(*p1, false)))) |
|
1813 |
return false; // Overlap must be in the text line. |
|
1814 |
if (!p1->ConfirmNoTabViolation(*p2)) |
|
1815 |
return false; |
|
1816 |
if (p1->flow() <= BTFT_NONTEXT && p2->flow() <= BTFT_NONTEXT) |
|
1817 |
return true; |
|
1818 |
return NoNoiseInBetween(p1->bounding_box(), p2->bounding_box()); |
|
1819 |
}
|
|
1820 |
||
1821 |
// Returns true if there is no significant noise in between the boxes.
|
|
1822 |
bool StrokeWidth::NoNoiseInBetween(const TBOX& box1, const TBOX& box2) const { |
|
1823 |
return ImageFind::BlankImageInBetween(box1, box2, grid_box_, rerotation_, |
|
1824 |
nontext_map_); |
|
199
by theraysmith
Changes to textord for 3.00 |
1825 |
}
|
1826 |
||
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
1827 |
/** Displays the blobs colored according to the number of good neighbours
|
1828 |
* and the vertical/horizontal flow.
|
|
1829 |
*/
|
|
199
by theraysmith
Changes to textord for 3.00 |
1830 |
ScrollView* StrokeWidth::DisplayGoodBlobs(const char* window_name, |
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
1831 |
int x, int y) { |
1832 |
ScrollView* window = NULL; |
|
199
by theraysmith
Changes to textord for 3.00 |
1833 |
#ifndef GRAPHICS_DISABLED
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
1834 |
window = MakeWindow(x, y, window_name); |
199
by theraysmith
Changes to textord for 3.00 |
1835 |
// For every blob in the grid, display it.
|
1836 |
window->Brush(ScrollView::NONE); |
|
1837 |
||
1838 |
// For every bbox in the grid, display it.
|
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
1839 |
BlobGridSearch gsearch(this); |
199
by theraysmith
Changes to textord for 3.00 |
1840 |
gsearch.StartFullSearch(); |
1841 |
BLOBNBOX* bbox; |
|
1842 |
while ((bbox = gsearch.NextFullSearch()) != NULL) { |
|
1843 |
TBOX box = bbox->bounding_box(); |
|
1844 |
int left_x = box.left(); |
|
1845 |
int right_x = box.right(); |
|
1846 |
int top_y = box.top(); |
|
1847 |
int bottom_y = box.bottom(); |
|
381
by zdenop at gmail
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process |
1848 |
int goodness = bbox->GoodTextBlob(); |
1849 |
BlobRegionType blob_type = bbox->region_type(); |
|
1850 |
if (bbox->UniquelyVertical()) |
|
1851 |
blob_type = BRT_VERT_TEXT; |
|
1852 |
if (bbox->UniquelyHorizontal()) |
|
1853 |
blob_type = BRT_TEXT; |
|
1854 |
BlobTextFlowType flow = bbox->flow(); |
|
1855 |
if (flow == BTFT_NONE) { |
|
1856 |
if (goodness == 0) |
|
1857 |
flow = BTFT_NEIGHBOURS; |
|
1858 |
else if (goodness == 1) |
|
1859 |
flow = BTFT_CHAIN; |
|
1860 |
else
|
|
1861 |
flow = BTFT_STRONG_CHAIN; |
|
1862 |
}
|
|
1863 |
window->Pen(BLOBNBOX::TextlineColor(blob_type, flow)); |
|
199
by theraysmith
Changes to textord for 3.00 |
1864 |
window->Rectangle(left_x, bottom_y, right_x, top_y); |
1865 |
}
|
|
1866 |
window->Update(); |
|
1867 |
#endif
|
|
1868 |
return window; |
|
1869 |
}
|
|
1870 |
||
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
1871 |
static void DrawDiacriticJoiner(const BLOBNBOX* blob, ScrollView* window) { |
751
by zdenop
fix build with -DGRAPHICS_DISABLED |
1872 |
#ifndef GRAPHICS_DISABLED
|
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
1873 |
const TBOX& blob_box(blob->bounding_box()); |
1874 |
int top = MAX(blob_box.top(), blob->base_char_top()); |
|
1875 |
int bottom = MIN(blob_box.bottom(), blob->base_char_bottom()); |
|
1876 |
int x = (blob_box.left() + blob_box.right()) / 2; |
|
1877 |
window->Line(x, top, x, bottom); |
|
751
by zdenop
fix build with -DGRAPHICS_DISABLED |
1878 |
#endif // GRAPHICS_DISABLED |
482
by theraysmith at gmail
Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding |
1879 |
}
|
1880 |
||
1881 |
// Displays blobs colored according to whether or not they are diacritics.
|
|
1882 |
ScrollView* StrokeWidth::DisplayDiacritics(const char* window_name, |
|
1883 |
int x, int y, TO_BLOCK* block) { |
|
1884 |
ScrollView* window = NULL; |
|
1885 |
#ifndef GRAPHICS_DISABLED
|
|
1886 |
window = MakeWindow(x, y, window_name); |
|
1887 |
// For every blob in the grid, display it.
|
|
1888 |
window->Brush(ScrollView::NONE); |
|
1889 |
||
1890 |
BLOBNBOX_IT it(&block->blobs); |
|
1891 |
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { |
|
1892 |
BLOBNBOX* blob = it.data(); |
|
1893 |
if (blob->IsDiacritic()) { |
|
1894 |
window->Pen(ScrollView::GREEN); |
|
1895 |
DrawDiacriticJoiner(blob, window); |
|
1896 |
} else { |
|
1897 |
window->Pen(blob->BoxColor()); |
|
1898 |
}
|
|
1899 |
const TBOX& box = blob->bounding_box(); |
|
1900 |
window->Rectangle(box.left(), box. bottom(), box.right(), box.top()); |
|
1901 |
}
|
|
1902 |
it.set_to_list(&block->noise_blobs); |
|
1903 |
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { |
|
1904 |
BLOBNBOX* blob = it.data(); |
|
1905 |
if (blob->IsDiacritic()) { |
|
1906 |
window->Pen(ScrollView::GREEN); |
|
1907 |
DrawDiacriticJoiner(blob, window); |
|
1908 |
} else { |
|
1909 |
window->Pen(ScrollView::WHITE); |
|
1910 |
}
|
|
1911 |
const TBOX& box = blob->bounding_box(); |
|
1912 |
window->Rectangle(box.left(), box. bottom(), box.right(), box.top()); |
|
1913 |
}
|
|
1914 |
window->Update(); |
|
1915 |
#endif
|
|
1916 |
return window; |
|
199
by theraysmith
Changes to textord for 3.00 |
1917 |
}
|
1918 |
||
1919 |
} // namespace tesseract. |