~ubuntu-branches/ubuntu/vivid/tesseract/vivid

« back to all changes in this revision

Viewing changes to wordrec/wordrec.cpp

  • Committer: Package Import Robot
  • Author(s): Jeff Breidenbach
  • Date: 2014-02-03 11:10:20 UTC
  • mfrom: (1.3.1) (19.1.1 experimental)
  • Revision ID: package-import@ubuntu.com-20140203111020-igquodd7pjlp3uri
Tags: 3.03.01-1
* New upstream release, includes critical fix to PDF rendering
* Complete leptonlib transition (see bug #735509)
* Promote from experimental to unstable

Show diffs side-by-side

added

removed

Lines of Context:
36
36
              "force associator to run regardless of what enable_assoc is."
37
37
              "This is used for CJK where component grouping is necessary.",
38
38
              CCUtil::params()),
39
 
  INT_MEMBER(wordrec_num_seg_states, 30, "Segmentation states",
40
 
             CCUtil::params()),
41
39
  double_MEMBER(wordrec_worst_state, 1.0, "Worst segmentation state",
42
40
                params()),
43
41
  BOOL_MEMBER(fragments_guide_chopper, FALSE,
59
57
             params()),
60
58
  INT_MEMBER(chop_min_outline_points, 6, "Min Number of Points on Outline",
61
59
             params()),
 
60
  INT_MEMBER(chop_seam_pile_size, 150, "Max number of seams in seam_pile",
 
61
             params()),
 
62
  BOOL_MEMBER(chop_new_seam_pile, 1, "Use new seam_pile", params()),
62
63
  INT_MEMBER(chop_inside_angle, -50, "Min Inside Angle Bend",
63
64
             params()),
64
65
  INT_MEMBER(chop_min_outline_area, 2000, "Min Outline Area",
69
70
                params()),
70
71
  double_MEMBER(chop_center_knob, 0.15, "Split center adjustment",
71
72
                params()),
 
73
  INT_MEMBER(chop_centered_maxwidth, 90, "Width of (smaller) chopped blobs "
 
74
             "above which we don't care that a chop is not near the center.",
 
75
             params()),
72
76
  double_MEMBER(chop_sharpness_knob, 0.06, "Split sharpness adjustment",
73
77
                params()),
74
78
  double_MEMBER(chop_width_change_knob, 5.0, "Width change adjustment",
84
88
  BOOL_MEMBER(assume_fixed_pitch_char_segment, FALSE,
85
89
              "include fixed-pitch heuristics in char segmentation",
86
90
              params()),
87
 
  BOOL_MEMBER(use_new_state_cost, FALSE,
88
 
              "use new state cost heuristics for segmentation state evaluation",
89
 
              params()),
90
 
  double_MEMBER(heuristic_segcost_rating_base, 1.25,
91
 
                "base factor for adding segmentation cost into word rating."
92
 
                "It's a multiplying factor, the larger the value above 1, "
93
 
                "the bigger the effect of segmentation cost.",
94
 
                params()),
95
 
  double_MEMBER(heuristic_weight_rating, 1.0,
96
 
                "weight associated with char rating in combined cost of state",
97
 
                params()),
98
 
  double_MEMBER(heuristic_weight_width, 1000.0,
99
 
                "weight associated with width evidence in combined cost of"
100
 
                " state", params()),
101
 
  double_MEMBER(heuristic_weight_seamcut, 0.0,
102
 
                "weight associated with seam cut in combined cost of state",
103
 
                params()),
104
 
  double_MEMBER(heuristic_max_char_wh_ratio, 2.0,
105
 
                "max char width-to-height ratio allowed in segmentation",
106
 
                params()),
107
91
  INT_MEMBER(wordrec_debug_level, 0,
108
92
             "Debug level for wordrec", params()),
 
93
  INT_MEMBER(wordrec_max_join_chunks, 4,
 
94
             "Max number of broken pieces to associate", params()),
 
95
  BOOL_MEMBER(wordrec_skip_no_truth_words, false,
 
96
              "Only run OCR for words that had truth recorded in BlamerBundle",
 
97
              params()),
109
98
  BOOL_MEMBER(wordrec_debug_blamer, false,
110
99
              "Print blamer debug messages", params()),
111
100
  BOOL_MEMBER(wordrec_run_blamer, false,
112
101
              "Try to set the blame for errors", params()),
113
 
  BOOL_MEMBER(enable_new_segsearch, true,
114
 
                   "Enable new segmentation search path.", params()),
115
102
  INT_MEMBER(segsearch_debug_level, 0,
116
103
             "SegSearch debug level", params()),
117
104
  INT_MEMBER(segsearch_max_pain_points, 2000,
118
105
             "Maximum number of pain points stored in the queue",
119
106
             params()),
120
 
  INT_MEMBER(segsearch_max_futile_classifications, 10,
121
 
             "Maximum number of pain point classifications per word that"
 
107
  INT_MEMBER(segsearch_max_futile_classifications, 20,
 
108
             "Maximum number of pain point classifications per chunk that"
122
109
             "did not result in finding a better word choice.",
123
110
             params()),
124
111
  double_MEMBER(segsearch_max_char_wh_ratio, 2.0,
125
112
                "Maximum character width-to-height ratio", params()),
126
 
  double_MEMBER(segsearch_max_fixed_pitch_char_wh_ratio, 2.0,
127
 
                "Maximum character width-to-height ratio for"
128
 
                " fixed-pitch fonts",
129
 
                params()),
130
 
  BOOL_MEMBER(save_alt_choices, false,
 
113
  BOOL_MEMBER(save_alt_choices, true,
131
114
              "Save alternative paths found during chopping"
132
115
              " and segmentation search",
133
116
              params()) {
134
117
  prev_word_best_choice_ = NULL;
135
118
  language_model_ = new LanguageModel(&get_fontinfo_table(),
136
119
                                      &(getDict()));
137
 
  pass2_seg_states = 0;
138
 
  num_joints = 0;
139
 
  num_pushed = 0;
140
 
  num_popped = 0;
141
120
  fill_lattice_ = NULL;
142
121
}
143
122
 
145
124
  delete language_model_;
146
125
}
147
126
 
148
 
void Wordrec::CopyCharChoices(const BLOB_CHOICE_LIST_VECTOR &from,
149
 
                              BLOB_CHOICE_LIST_VECTOR *to) {
150
 
  to->delete_data_pointers();
151
 
  to->clear();
152
 
  for (int i = 0; i < from.size(); ++i) {
153
 
    BLOB_CHOICE_LIST *cc_list = new BLOB_CHOICE_LIST();
154
 
    cc_list->deep_copy(from[i], &BLOB_CHOICE::deep_copy);
155
 
    to->push_back(cc_list);
156
 
  }
157
 
}
158
 
 
159
 
bool Wordrec::ChoiceIsCorrect(const UNICHARSET &uni_set,
160
 
                              const WERD_CHOICE *choice,
161
 
                              const GenericVector<STRING> &truth_text) {
162
 
  if (choice == NULL) return false;
163
 
  int i;
164
 
  STRING truth_str;
165
 
  for (i = 0; i < truth_text.length(); ++i) truth_str += truth_text[i];
166
 
  STRING normed_choice_str;
167
 
  for (i = 0; i < choice->length(); ++i) {
168
 
    normed_choice_str += uni_set.get_normed_unichar(choice->unichar_id(i));
169
 
  }
170
 
  return (truth_str == normed_choice_str);
171
 
}
172
 
 
173
 
void Wordrec::SaveAltChoices(const LIST &best_choices, WERD_RES *word) {
174
 
  ASSERT_HOST(word->alt_choices.empty());
175
 
  ASSERT_HOST(word->alt_states.empty());
176
 
  LIST list_it;
177
 
  iterate_list(list_it, best_choices) {
178
 
    VIABLE_CHOICE choice =
179
 
        reinterpret_cast<VIABLE_CHOICE>(first_node(list_it));
180
 
    CHAR_CHOICE *char_choice = &(choice->Blob[0]);
181
 
    WERD_CHOICE *alt_choice = new WERD_CHOICE(word->uch_set, choice->Length);
182
 
    word->alt_states.push_back(GenericVector<int>(choice->Length));
183
 
    GenericVector<int> &alt_state = word->alt_states.back();
184
 
    for (int i = 0; i < choice->Length; char_choice++, i++) {
185
 
      alt_choice->append_unichar_id_space_allocated(
186
 
          char_choice->Class, 1, 0, 0);
187
 
      alt_state.push_back(char_choice->NumChunks);
188
 
    }
189
 
    alt_choice->set_rating(choice->Rating);
190
 
    alt_choice->set_certainty(choice->Certainty);
191
 
 
192
 
    ASSERT_HOST(choice->blob_choices != NULL);
193
 
    alt_choice->set_blob_choices(choice->blob_choices);
194
 
    choice->blob_choices = NULL;
195
 
 
196
 
    word->alt_choices.push_back(alt_choice);
197
 
    if (wordrec_debug_level > 0) {
198
 
      tprintf("SaveAltChoices: %s %g\n",
199
 
              alt_choice->unichar_string().string(), alt_choice->rating());
200
 
    }
201
 
  }
202
 
}
203
 
 
204
127
}  // namespace tesseract