157
// If tesseract is to be run, sets the words up ready for it.
// Walks every word reachable from page_res, calls SetupFake on it, and
// appends a WordData for each word that passes the optional target-box
// filter. Every collected entry is then prepared with SetupWordPassN and
// linked to the entry before it through prev_word.
//   pass_n           pass number handed on to SetupWordPassN.
//   target_word_box  when NULL every word is collected; otherwise only the
//                    words for which ProcessTargetWord returns true.
//   word_config      passed through to ProcessTargetWord.
//   words            output vector; entries are appended with push_back.
// NOTE(review): the call sites later in this file pass page_res as an extra
// argument ahead of words, but no such parameter is visible in the list
// below - confirm the signature against the header.
158
void Tesseract::SetupAllWordsPassN(int pass_n,
159
const TBOX* target_word_box,
160
const char* word_config,
162
GenericVector<WordData>* words) {
163
// Prepare all the words.
164
PAGE_RES_IT page_res_it(page_res);
165
for (page_res_it.restart_page(); page_res_it.word() != NULL;
166
page_res_it.forward()) {
168
// SetupFake runs before the target-box filter, so it also touches words that
// are not collected. TODO confirm whether it is meant to be unconditional;
// no guard is visible here.
page_res_it.word()->SetupFake(unicharset);
169
// NOTE(review): the last ProcessTargetWord argument is the literal 1 rather
// than pass_n, so a pass_n == 2 call still reports pass 1 - confirm intended.
if (target_word_box == NULL ||
170
ProcessTargetWord(page_res_it.word()->word->bounding_box(),
171
*target_word_box, word_config, 1)) {
172
words->push_back(WordData(page_res_it));
175
// Setup all the words for recognition with polygonal approximation.
176
for (int w = 0; w < words->size(); ++w) {
177
SetupWordPassN(pass_n, &(*words)[w]);
178
// The first entry has no predecessor, so its prev_word is not assigned here.
if (w > 0) (*words)[w].prev_word = &(*words)[w - 1];
182
// Sets up the single word ready for whichever engine is to be run.
// Prepares word->word (the result for this Tesseract instance) and, when
// sub_langs_ is non-empty, one word->lang_words entry per sub-language, by
// calling SetupForRecognition with the relevant instance's settings.
//   pass_n  pass number; 1 forces setup, other values make it conditional.
//   word    WordData whose word, row, block and lang_words members are used.
183
void Tesseract::SetupWordPassN(int pass_n, WordData* word) {
184
// Main word: handled on pass 1, or when the word is not yet done, or when
// tessedit_training_tess is set.
if (pass_n == 1 || !word->word->done || tessedit_training_tess) {
186
// TODO(rays) Should we do this on pass1 too?
187
// caps_height is cleared; x_height falls back to the row's value only when
// the word has none of its own (== 0).
word->word->caps_height = 0.0;
188
if (word->word->x_height == 0.0f)
189
word->word->x_height = word->row->x_height();
191
// Cube doesn't get setup for pass2.
192
if (pass_n != 2 || tessedit_ocr_engine_mode != OEM_CUBE_ONLY) {
193
word->word->SetupForRecognition(
194
unicharset, this, BestPix(), tessedit_ocr_engine_mode, NULL,
195
classify_bln_numeric_mode, textord_use_cjk_fp_model,
196
poly_allow_detailed_fx, word->row, word->block);
199
// Sub-languages: lang_words is sized to one entry per sub-language whenever
// its current size does not match sub_langs_.
if (!sub_langs_.empty()) {
200
if (word->lang_words.size() != sub_langs_.size()) {
201
// Setup the words for all the sub-languages now.
203
// NOTE(review): 'empty' is used as the fill value but its declaration is not
// visible in this view - confirm it is declared just above.
word->lang_words.init_to_size(sub_langs_.size(), empty);
205
for (int s = 0; s < sub_langs_.size(); ++s) {
206
Tesseract* lang_t = sub_langs_[s];
207
// A sub-language word is set up on pass 1 unconditionally; on other passes
// only if that sub-language is not cube-only and its word is either not done
// or the sub-language has tessedit_training_tess set.
if (pass_n == 1 || (lang_t->tessedit_ocr_engine_mode != OEM_CUBE_ONLY &&
208
(!word->lang_words[s].done || lang_t->tessedit_training_tess))) {
209
// Initialised from the main word, then set up with lang_t's own unicharset
// and settings. BestPix() is called on this instance, not on lang_t.
word->lang_words[s].InitForRetryRecognition(*word->word);
210
word->lang_words[s].SetupForRecognition(
211
lang_t->unicharset, lang_t, BestPix(),
212
lang_t->tessedit_ocr_engine_mode, NULL,
213
lang_t->classify_bln_numeric_mode,
214
lang_t->textord_use_cjk_fp_model,
215
lang_t->poly_allow_detailed_fx, word->row, word->block);
222
// Runs word recognition on all the words.
//   pass_n   1 selects classify_word_pass1; any other value selects
//            classify_word_pass2.
//   monitor  may be NULL; when set it is marked alive, its progress field is
//            updated per word, and it is polled for deadline/cancel.
//   words    the prepared WordData entries, processed in index order.
// Returns bool. NOTE(review): no return statement is visible in this view;
// callers in this file treat a false result as "stop and return false".
223
bool Tesseract::RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor,
224
GenericVector<WordData>* words) {
225
// TODO(rays) Before this loop can be parallelized (it would yield a massive
226
// speed-up) all remaining member globals need to be converted to local/heap
227
// (eg set_pass1 and set_pass2) and an intermediate adaption pass needs to be
228
// added. The results will be significantly different with adaption on, and
229
// deterioration will need investigation.
230
for (int w = 0; w < words->size(); ++w) {
231
WordData* word = &(*words)[w];
232
if (monitor != NULL) {
233
monitor->ocr_alive = TRUE;
235
// Progress is scaled by the word index: offset 30 with span 50, or offset 80
// with span 10. The condition choosing between the two assignments is not
// visible here - presumably pass_n; confirm.
monitor->progress = 30 + 50 * w / words->size();
237
monitor->progress = 80 + 10 * w / words->size();
238
// Stop early when the deadline has passed or the cancel callback, if one is
// installed, is invoked and (presumably) requests cancellation.
if (monitor->deadline_exceeded() ||
239
(monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this,
241
// Timeout. Fake out the rest of the words.
242
// This inner loop reuses the outer index w, so it starts at the current word
// and leaves w == words->size() when it finishes.
for (; w < words->size(); ++w) {
243
(*words)[w].word->SetupFake(unicharset);
248
// Words already flagged tess_failed are not classified.
if (word->word->tess_failed) continue;
249
// Pick the per-pass classifier as a pointer to member function.
WordRecognizer recognizer = pass_n == 1 ? &Tesseract::classify_word_pass1
250
: &Tesseract::classify_word_pass2;
251
classify_word_and_language(recognizer, word);
252
// Optional debug dump of the resulting best choice.
if (tessedit_dump_choices) {
253
word_dumper(NULL, word->row, word->word);
254
tprintf("Pass%d: %s [%s]\n", pass_n,
255
word->word->best_choice->unichar_string().string(),
256
word->word->best_choice->debug_string().string());
157
263
* recog_all_words()
179
285
const TBOX* target_word_box,
180
286
const char* word_config,
182
PAGE_RES_IT page_res_it;
183
inT32 word_index; // current word
288
PAGE_RES_IT page_res_it(page_res);
185
290
if (tessedit_minimal_rej_pass1) {
186
291
tessedit_test_adaption.set_value (TRUE);
187
292
tessedit_minimal_rejection.set_value (TRUE);
190
// Before the main recognition loop below, walk through the whole page and set
191
// up fake words. That way, if we run out of time a user will still get the
192
// expected best_choice and box_words out the end; they'll just be empty.
193
page_res_it.page_res = page_res;
194
for (page_res_it.restart_page(); page_res_it.word() != NULL;
195
page_res_it.forward()) {
196
page_res_it.word()->SetupFake(unicharset);
199
295
if (dopasses==0 || dopasses==1) {
200
page_res_it.page_res=page_res;
201
296
page_res_it.restart_page();
203
297
// ****************** Pass 1 *******************
205
299
// Clear adaptive classifier at the beginning of the page if it is full.
214
308
if (sub_langs_[i]->AdaptiveClassifierIsFull())
215
309
sub_langs_[i]->ResetAdaptiveClassifierInternal();
218
stats_.word_count = 0;
219
if (monitor != NULL) {
220
monitor->ocr_alive = TRUE;
221
while (page_res_it.word() != NULL) {
223
page_res_it.forward();
225
page_res_it.restart_page();
227
stats_.word_count = 1;
311
// Set up all words ready for recognition, so that if parallelism is on
312
// all the input and output classes are ready to run the classifier.
313
GenericVector<WordData> words;
314
SetupAllWordsPassN(1, target_word_box, word_config, page_res, &words);
315
if (tessedit_parallelize) {
316
PrerecAllWordsPar(words);
319
stats_.word_count = words.size();
232
321
stats_.dict_words = 0;
233
322
stats_.doc_blob_quality = 0;
237
326
stats_.doc_good_char_quality = 0;
239
328
most_recently_used_ = this;
329
// Run pass 1 word recognition.
330
if (!RecogAllWordsPassN(1, monitor, &words)) return false;
331
// Pass 1 post-processing.
240
332
while (page_res_it.word() != NULL) {
241
set_global_loc_code(LOC_PASS1);
243
if (monitor != NULL) {
244
monitor->ocr_alive = TRUE;
245
monitor->progress = 30 + 50 * word_index / stats_.word_count;
246
if (monitor->deadline_exceeded() ||
247
(monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this,
251
if (target_word_box &&
252
!ProcessTargetWord(page_res_it.word()->word->bounding_box(),
253
*target_word_box, word_config, 1)) {
254
page_res_it.forward();
257
classify_word_and_language(&Tesseract::classify_word_pass1,
258
page_res_it.block()->block,
259
page_res_it.row()->row,
261
333
if (page_res_it.word()->word->flag(W_REP_CHAR)) {
262
334
fix_rep_char(&page_res_it);
263
335
page_res_it.forward();
266
if (tessedit_dump_choices) {
267
word_dumper(NULL, page_res_it.row()->row, page_res_it.word());
268
tprintf("Pass1: %s [%s]\n",
269
page_res_it.word()->best_choice->unichar_string().string(),
270
page_res_it.word()->best_choice->debug_string().string());
273
// tessedit_test_adaption enables testing of the accuracy of the
274
// input to the adaptive classifier.
275
if (tessedit_test_adaption && !tessedit_minimal_rejection) {
276
if (!word_adaptable (page_res_it.word(),
277
tessedit_test_adaption_mode)) {
278
page_res_it.word()->reject_map.rej_word_tess_failure();
281
// Override rejection mechanisms for this word.
282
UNICHAR_ID space = unicharset.unichar_to_id(" ");
283
for (int i = 0; i < page_res_it.word()->best_choice->length(); i++) {
284
if ((page_res_it.word()->best_choice->unichar_id(i) != space) &&
285
page_res_it.word()->reject_map[i].rejected())
286
page_res_it.word()->reject_map[i].setrej_minimal_rej_accept();
291
339
// Count dict words.
292
340
if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM)
307
355
if (dopasses == 1) return true;
309
357
// ****************** Pass 2 *******************
310
page_res_it.restart_page();
312
most_recently_used_ = this;
313
while (tessedit_tess_adaption_mode != 0x0 && !tessedit_test_adaption &&
314
page_res_it.word() != NULL) {
315
set_global_loc_code(LOC_PASS2);
317
if (monitor != NULL) {
318
monitor->ocr_alive = TRUE;
319
monitor->progress = 80 + 10 * word_index / stats_.word_count;
320
if (monitor->deadline_exceeded() ||
321
(monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this,
326
// changed by jetsoft
327
// specific to its needs to extract one word when need
328
if (target_word_box &&
329
!ProcessTargetWord(page_res_it.word()->word->bounding_box(),
330
*target_word_box, word_config, 2)) {
331
page_res_it.forward();
336
classify_word_and_language(&Tesseract::classify_word_pass2,
337
page_res_it.block()->block,
338
page_res_it.row()->row,
340
if (page_res_it.word()->word->flag(W_REP_CHAR) &&
341
!page_res_it.word()->done) {
342
fix_rep_char(&page_res_it);
343
page_res_it.forward();
346
if (tessedit_dump_choices) {
347
word_dumper(NULL, page_res_it.row()->row, page_res_it.word());
348
tprintf("Pass2: %s [%s]\n",
349
page_res_it.word()->best_choice->unichar_string().string(),
350
page_res_it.word()->best_choice->debug_string().string());
352
page_res_it.forward();
358
if (tessedit_tess_adaption_mode != 0x0 && !tessedit_test_adaption) {
359
page_res_it.restart_page();
360
GenericVector<WordData> words;
361
SetupAllWordsPassN(2, target_word_box, word_config, page_res, &words);
362
if (tessedit_parallelize) {
363
PrerecAllWordsPar(words);
365
most_recently_used_ = this;
366
// Run pass 2 word recognition.
367
if (!RecogAllWordsPassN(2, monitor, &words)) return false;
368
// Pass 2 post-processing.
369
while (page_res_it.word() != NULL) {
370
WERD_RES* word = page_res_it.word();
371
if (word->word->flag(W_REP_CHAR) && !word->done) {
372
fix_rep_char(&page_res_it);
373
page_res_it.forward();
376
page_res_it.forward();
355
380
// The next passes can only be run if tesseract has been used, as cube
701
// Sets script positions and detects smallcaps on all output words.
// Iterates over every word of page_res, flags a word as small caps when its
// x-height falls in a window derived from the block x-height and its best
// choice has upper-case but no lower-case characters, and calls
// SetScriptPositions on the word.
//   page_res  page results to iterate over; words are modified in place.
702
void Tesseract::script_pos_pass(PAGE_RES* page_res) {
703
PAGE_RES_IT page_res_it(page_res);
704
for (page_res_it.restart_page(); page_res_it.word() != NULL;
705
page_res_it.forward()) {
706
WERD_RES* word = page_res_it.word();
707
// NOTE(review): the iterator is advanced here and again by the loop header.
// If this branch then continues the loop (not visible in this view), the word
// following a W_REP_CHAR word would be skipped - confirm intended.
if (word->word->flag(W_REP_CHAR)) {
708
page_res_it.forward();
711
// x_height is the block-level value. word_x_height starts as the word's own
// value and is replaced by the midpoint of the best choice's
// [min_x_height, max_x_height] range when it lies outside that range.
float x_height = page_res_it.block()->block->x_height();
712
float word_x_height = word->x_height;
713
if (word_x_height < word->best_choice->min_x_height() ||
714
word_x_height > word->best_choice->max_x_height()) {
715
word_x_height = (word->best_choice->min_x_height() +
716
word->best_choice->max_x_height()) / 2.0f;
718
// Test for small caps. Word capheight must be close to block xheight,
719
// and word must contain no lower case letters, and at least one upper case.
720
// The accepted window is centred on x_height * kXHeightCapRatio, with a
// half-width of half the gap between x_height and that centre.
double small_cap_xheight = x_height * kXHeightCapRatio;
721
double small_cap_delta = (x_height - small_cap_xheight) / 2.0;
722
if (word->uch_set->script_has_xheight() &&
723
small_cap_xheight - small_cap_delta <= word_x_height &&
724
word_x_height <= small_cap_xheight + small_cap_delta) {
725
// Scan for upper/lower.
728
// NOTE(review): the declarations of num_upper/num_lower and the statements
// executed by the two branches below are not visible in this view;
// presumably they count upper- and lower-case characters - confirm.
for (int i = 0; i < word->best_choice->length(); ++i) {
729
if (word->uch_set->get_isupper(word->best_choice->unichar_id(i)))
731
else if (word->uch_set->get_islower(word->best_choice->unichar_id(i)))
734
// Small caps requires at least one upper-case and no lower-case character.
if (num_upper > 0 && num_lower == 0)
735
word->small_caps = true;
737
word->SetScriptPositions();
675
741
// Helper returns true if the new_word is better than the word, using a
676
742
// simple test of better certainty AND rating (to reduce false positives
677
743
// from cube) or a dictionary vs non-dictionary word.
702
768
// Helper to recognize the word using the given (language-specific) tesseract.
703
769
// Returns true if the result was better than previously.
704
bool Tesseract::RetryWithLanguage(WERD_RES *word, BLOCK* block, ROW *row,
770
bool Tesseract::RetryWithLanguage(const WERD_RES& best_word,
771
WordData* word_data, WERD_RES* word,
705
772
WordRecognizer recognizer) {
706
773
if (classify_debug_level || cube_debug_level) {
707
774
tprintf("Retrying word using lang %s, oem %d\n",
708
775
lang.string(), static_cast<int>(tessedit_ocr_engine_mode));
710
// Setup a trial WERD_RES in which to classify.
712
lang_word.InitForRetryRecognition(*word);
713
777
// Run the recognizer on the word.
714
778
// Initial version is a bit of a hack based on better certainty and rating
715
779
// (to reduce false positives from cube) or a dictionary vs non-dictionary
717
(this->*recognizer)(block, row, &lang_word);
718
bool new_is_better = NewWordBetter(*word, lang_word,
781
(this->*recognizer)(word_data, word);
782
bool new_is_better = NewWordBetter(best_word, *word,
719
783
classify_max_rating_ratio,
720
784
classify_max_certainty_margin);
721
785
if (classify_debug_level || cube_debug_level) {
722
if (lang_word.best_choice == NULL) {
723
tprintf("New result %s better:%s\n",
786
if (word->best_choice == NULL) {
787
tprintf("NULL result %s better!\n",
724
788
new_is_better ? "IS" : "NOT");
726
790
tprintf("New result %s better:%s, r=%g, c=%g\n",
727
791
new_is_better ? "IS" : "NOT",
728
lang_word.best_choice->unichar_string().string(),
729
lang_word.best_choice->rating(),
730
lang_word.best_choice->certainty());
792
word->best_choice->unichar_string().string(),
793
word->best_choice->rating(),
794
word->best_choice->certainty());
734
word->ConsumeWordResults(&lang_word);
736
797
return new_is_better;
755
816
const char* result_type = "Initial";
756
817
bool initially_done = !word->tess_failed && word->done;
757
818
if (initially_done) {
758
// If done on pass1, we reuse the tesseract that did it, and don't try
759
// any more. The only need to call the classifier at all is for the
760
// cube combiner and xheight fixing (which may be bogus on a done word.)
819
// If done on pass1, leave it as-is.
761
820
most_recently_used_ = word->tesseract;
762
821
result_type = "Already done";
823
if (most_recently_used_ != this) {
824
// Point to the word for most_recently_used_.
825
for (int s = 0; s < sub_langs_.size(); ++s) {
826
if (most_recently_used_ == sub_langs_[s]) {
827
word = &word_data->lang_words[s];
832
(most_recently_used_->*recognizer)(word_data, word);
833
if (!word->tess_failed && word->tess_accepted)
834
result_type = "Accepted";
764
(most_recently_used_->*recognizer)(block, row, word);
765
if (!word->tess_failed && word->tess_accepted)
766
result_type = "Accepted";
767
836
if (classify_debug_level || cube_debug_level) {
768
837
tprintf("%s result: %s r=%.4g, c=%.4g, accepted=%d, adaptable=%d"
769
838
" xht=[%g,%g]\n",
782
851
if (classify_debug_level) {
783
852
tprintf("Retrying with main-Tesseract, lang: %s\n", lang.string());
785
if (RetryWithLanguage(word, block, row, recognizer)) {
786
most_recently_used_ = this;
787
if (!word->tess_failed && word->tess_accepted)
788
return; // No need to look at the others.
854
if (word_data->word->tesseract == this) {
855
// This is pass1, and we are trying the main language.
856
if (RetryWithLanguage(*word, word_data, word_data->word, recognizer)) {
857
most_recently_used_ = this;
858
word = word_data->word;
861
// This is pass2, and we are trying the main language again, but it
862
// has no word allocated to it, so we must re-initialize it.
863
WERD_RES main_word(*word_data->word);
864
main_word.InitForRetryRecognition(*word_data->word);
865
main_word.SetupForRecognition(unicharset, this, BestPix(),
866
tessedit_ocr_engine_mode, NULL,
867
classify_bln_numeric_mode,
868
textord_use_cjk_fp_model,
869
poly_allow_detailed_fx,
870
word_data->row, word_data->block);
871
if (RetryWithLanguage(*word, word_data, &main_word, recognizer)) {
872
most_recently_used_ = this;
873
word_data->word->ConsumeWordResults(&main_word);
874
word = word_data->word;
877
if (!word->tess_failed && word->tess_accepted)
878
return; // No need to look at the others.
792
881
for (int i = 0; i < sub_langs_.size(); ++i) {
795
884
tprintf("Retrying with sub-Tesseract[%d] lang: %s\n",
796
885
i, sub_langs_[i]->lang.string());
798
if (sub_langs_[i]->RetryWithLanguage(word, block, row, recognizer)) {
887
if (sub_langs_[i]->RetryWithLanguage(*word, word_data,
888
&word_data->lang_words[i],
799
890
most_recently_used_ = sub_langs_[i];
891
word = &word_data->lang_words[i];
800
892
if (!word->tess_failed && word->tess_accepted)
801
return; // No need to look at the others.
893
break; // No need to look at the others.
898
if (word != word_data->word) {
899
// Move the result for the best language to the main word.
900
word_data->word->ConsumeWordResults(word);
806
902
clock_t ocr_t = clock();
807
903
if (tessedit_timing_debug) {
808
904
tprintf("%s (ocr took %.2f sec)\n",
940
1048
// Use the tops and bottoms since they are available.
941
1049
TrainedXheightFix(word, block, row);
943
// Test for small caps. Word capheight must be close to block xheight,
944
// and word must contain no lower case letters, and at least one upper case.
945
double small_cap_xheight = block->x_height() * kXHeightCapRatio;
946
double small_cap_delta = (block->x_height() - small_cap_xheight) / 2.0;
947
if (unicharset.script_has_xheight() &&
948
small_cap_xheight - small_cap_delta <= word->x_height &&
949
word->x_height <= small_cap_xheight + small_cap_delta) {
950
// Scan for upper/lower.
953
for (int i = 0; i < word->best_choice->length(); ++i) {
954
if (unicharset.get_isupper(word->best_choice->unichar_id(i)))
956
else if (unicharset.get_islower(word->best_choice->unichar_id(i)))
959
if (num_upper > 0 && num_lower == 0)
960
word->small_caps = true;
962
word->SetScriptPositions();
964
1052
set_global_subloc_code(SUBLOC_NORM);
1136
1220
WERD_RES* rep_word =
1137
1221
page_res_it->InsertSimpleCloneWord(*word_res, blob_word);
1138
1222
// Setup the single char WERD_RES
1139
if (rep_word->SetupForTessRecognition(*word_res->uch_set, this, BestPix(),
1141
textord_use_cjk_fp_model,
1142
poly_allow_detailed_fx,
1143
page_res_it->row()->row,
1144
page_res_it->block()->block)) {
1223
if (rep_word->SetupForRecognition(*word_res->uch_set, this, BestPix(),
1224
tessedit_ocr_engine_mode, NULL, false,
1225
textord_use_cjk_fp_model,
1226
poly_allow_detailed_fx,
1227
page_res_it->row()->row,
1228
page_res_it->block()->block)) {
1145
1229
rep_word->CloneChoppedToRebuild();
1146
1230
BLOB_CHOICE* blob_choice = new BLOB_CHOICE(*best_choice);
1147
1231
rep_word->FakeClassifyWord(1, &blob_choice);