167
160
* @note History: Mon Mar 11 10:00:58 1991, DSJ, Created.
169
162
* @param Blob blob to be classified
170
* @param denorm normalization/denormalization parameters
171
163
* @param[out] Choices List of choices found by adaptive matcher.
172
* @param[out] CPResults Array of CPResultStruct of size MAX_NUM_CLASSES is
173
164
* filled on return with the choices found by the
174
165
* class pruner and the ratings therefrom. Also
175
166
* contains the detailed results of the integer matcher.
178
void Classify::AdaptiveClassifier(TBLOB *Blob,
179
const DENORM& denorm,
180
BLOB_CHOICE_LIST *Choices,
181
CLASS_PRUNER_RESULTS CPResults) {
169
void Classify::AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices) {
182
170
assert(Choices != NULL);
183
ADAPT_RESULTS *Results = new ADAPT_RESULTS();
171
ADAPT_RESULTS *Results = new ADAPT_RESULTS;
184
172
Results->Initialize();
186
if (AdaptedTemplates == NULL)
187
AdaptedTemplates = NewAdaptedTemplates (true);
188
DoAdaptiveMatch(Blob, denorm, Results);
189
if (CPResults != NULL)
190
memcpy(CPResults, Results->CPResults,
191
sizeof(CPResults[0]) * Results->NumMatches);
174
ASSERT_HOST(AdaptedTemplates != NULL);
176
DoAdaptiveMatch(Blob, Results);
193
178
RemoveBadMatches(Results);
194
qsort((void *)Results->match, Results->NumMatches,
195
sizeof(ScoredClass), CompareByRating);
179
Results->match.sort(CompareByRating);
196
180
RemoveExtraPuncs(Results);
197
ConvertMatchesToChoices(denorm, Blob->bounding_box(), Results, Choices);
181
ConvertMatchesToChoices(Blob->denorm(), Blob->bounding_box(), Results,
199
184
if (matcher_debug_level >= 1) {
200
185
cprintf ("AD Matches = ");
201
186
PrintAdaptiveMatchResults(stdout, Results);
204
if (LargeSpeckle(Blob))
205
AddLargeSpeckleTo(Choices);
189
if (LargeSpeckle(*Blob) || Choices->length() == 0)
190
AddLargeSpeckleTo(Results->BlobLength, Choices);
207
192
#ifndef GRAPHICS_DISABLED
208
193
if (classify_enable_adaptive_debugger)
209
DebugAdaptiveClassifier(Blob, denorm, Results);
194
DebugAdaptiveClassifier(Blob, Results);
212
NumClassesOutput += Choices->length();
213
if (Choices->length() == 0) {
214
if (!classify_bln_numeric_mode)
215
tprintf ("Empty classification!\n"); // Should never normally happen.
216
Choices = new BLOB_CHOICE_LIST();
217
BLOB_CHOICE_IT temp_it;
218
temp_it.set_to_list(Choices);
220
new BLOB_CHOICE(0, 50.0f, -20.0f, -1, -1, NULL, 0, 0, false));
224
198
} /* AdaptiveClassifier */
251
225
// Otherwise AdaptToBlob is called for adaption within a document.
252
226
// If rejmap is not NULL, then only chars with a rejmap entry of '1' will
253
227
// be learned, otherwise all chars with good correct_text are learned.
254
void Classify::LearnWord(const char* filename, const char *rejmap,
228
void Classify::LearnWord(const char* filename, WERD_RES *word) {
256
229
int word_len = word->correct_text.size();
257
230
if (word_len == 0) return;
259
232
float* thresholds = NULL;
260
233
if (filename == NULL) {
261
234
// Adaption mode.
262
if (!EnableLearning || word->best_choice == NULL ||
263
// If word->best_choice is not recorded at the top of accumulator's
264
// best choices (which could happen for choices that are
265
// altered with ReplaceAmbig()) we skip the adaption.
266
!getDict().CurrentBestChoiceIs(*(word->best_choice)))
235
if (!EnableLearning || word->best_choice == NULL)
267
236
return; // Can't or won't adapt.
270
238
if (classify_learning_debug_level >= 1)
271
239
tprintf("\n\nAdapting to word = %s\n",
272
240
word->best_choice->debug_string().string());
273
241
thresholds = new float[word_len];
274
GetAdaptThresholds(word->rebuild_word, word->denorm, *word->best_choice,
275
*word->raw_choice, thresholds);
242
word->ComputeAdaptionThresholds(certainty_scale,
243
matcher_perfect_threshold,
244
matcher_good_threshold,
245
matcher_rating_margin, thresholds);
277
247
int start_blob = 0;
278
char prev_map_char = '0';
280
249
#ifndef GRAPHICS_DISABLED
281
250
if (classify_debug_character_fragments) {
346
311
// TODO(rays): re-enable this part of the code when we switch to the
347
312
// new classifier that needs to see examples of garbage.
349
char next_map_char = ch + 1 < word_len
350
? (rejmap != NULL ? *rejmap : '1')
352
314
if (word->best_state[ch] > 1) {
353
315
// If the next blob is good, make junk with the rightmost fragment.
354
if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0 &&
355
next_map_char == '1') {
316
if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) {
356
317
LearnPieces(filename, start_blob + word->best_state[ch] - 1,
357
318
word->best_state[ch + 1] + 1,
358
319
threshold, CST_IMPROPER, INVALID_UNICHAR, word);
360
321
// If the previous blob is good, make junk with the leftmost fragment.
361
if (ch > 0 && word->correct_text[ch - 1].length() > 0 &&
362
prev_map_char == '1') {
322
if (ch > 0 && word->correct_text[ch - 1].length() > 0) {
363
323
LearnPieces(filename, start_blob - word->best_state[ch - 1],
364
324
word->best_state[ch - 1] + 1,
365
325
threshold, CST_IMPROPER, INVALID_UNICHAR, word);
368
328
// If the next blob is good, make a join with it.
369
if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0 &&
370
next_map_char == '1') {
329
if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) {
371
330
STRING joined_text = word->correct_text[ch];
372
331
joined_text += word->correct_text[ch + 1];
373
332
LearnPieces(filename, start_blob,
639
594
NumAdaptationsFailed);
641
596
free_adapted_templates(AdaptedTemplates);
642
AdaptedTemplates = NULL;
597
AdaptedTemplates = NewAdaptedTemplates(true);
643
598
NumAdaptationsFailed = 0;
647
/*---------------------------------------------------------------------------*/
649
* Print to File the statistics which have
650
* been gathered for the adaptive matcher.
652
* @param File open text file to print adaptive statistics to
656
* @note Exceptions: none
657
* @note History: Thu Apr 18 14:37:37 1991, DSJ, Created.
659
void Classify::PrintAdaptiveStatistics(FILE *File) {
662
fprintf (File, "\nADAPTIVE MATCHER STATISTICS:\n");
663
fprintf (File, "\tNum blobs classified = %d\n", AdaptiveMatcherCalls);
664
fprintf (File, "\tNum classes output = %d (Avg = %4.2f)\n",
666
((AdaptiveMatcherCalls == 0) ? (0.0) :
667
((float) NumClassesOutput / AdaptiveMatcherCalls)));
668
fprintf (File, "\t\tBaseline Classifier: %4d calls (%4.2f classes/call)\n",
669
BaselineClassifierCalls,
670
((BaselineClassifierCalls == 0) ? (0.0) :
671
((float) NumBaselineClassesTried / BaselineClassifierCalls)));
672
fprintf (File, "\t\tCharNorm Classifier: %4d calls (%4.2f classes/call)\n",
673
CharNormClassifierCalls,
674
((CharNormClassifierCalls == 0) ? (0.0) :
675
((float) NumCharNormClassesTried / CharNormClassifierCalls)));
676
fprintf (File, "\t\tAmbig Classifier: %4d calls (%4.2f classes/call)\n",
677
AmbigClassifierCalls,
678
((AmbigClassifierCalls == 0) ? (0.0) :
679
((float) NumAmbigClassesTried / AmbigClassifierCalls)));
681
fprintf (File, "\nADAPTIVE LEARNER STATISTICS:\n");
682
fprintf (File, "\tNumber of words adapted to: %d\n", NumWordsAdaptedTo);
683
fprintf (File, "\tNumber of chars adapted to: %d\n", NumCharsAdaptedTo);
685
PrintAdaptedTemplates(File, AdaptedTemplates);
687
} /* PrintAdaptiveStatistics */
690
603
/*---------------------------------------------------------------------------*/
1068
/*---------------------------------------------------------------------------*/
1070
* @param Blob blob to add to templates for ClassId
1071
* @param denorm normalization/denormalization parameters
1072
* @param ClassId class to add blob to
1073
* @param FontinfoId font information from pre-trained templates
1074
* @param Threshold minimum match rating to existing template
1077
* - PreTrainedTemplates current set of built-in templates
1079
* @note Exceptions: none
1080
* @note History: Thu Mar 14 09:36:03 1991, DSJ, Created.
1082
void Classify::AdaptToPunc(TBLOB *Blob,
1083
const DENORM& denorm,
1086
FLOAT32 Threshold) {
1087
ADAPT_RESULTS *Results = new ADAPT_RESULTS();
1090
Results->Initialize();
1091
CharNormClassifier(Blob, denorm, PreTrainedTemplates, Results);
1092
RemoveBadMatches(Results);
1094
if (Results->NumMatches != 1) {
1095
if (classify_learning_debug_level >= 1) {
1096
cprintf ("Rejecting punc = %s (Alternatives = ",
1097
unicharset.id_to_unichar(ClassId));
1099
for (i = 0; i < Results->NumMatches; i++)
1100
tprintf("%s", unicharset.id_to_unichar(Results->match[i].unichar_id));
1104
#ifndef SECURE_NAMES
1105
if (classify_learning_debug_level >= 1)
1106
cprintf ("Adapting to punc = %s, thr= %g\n",
1107
unicharset.id_to_unichar(ClassId), Threshold);
1109
AdaptToChar(Blob, denorm, ClassId, FontinfoId, Threshold);
1115
968
/*---------------------------------------------------------------------------*/
1202
1054
* @note Exceptions: none
1203
1055
* @note History: Tue Mar 12 19:40:36 1991, DSJ, Created.
1205
void Classify::AmbigClassifier(TBLOB *Blob,
1206
const DENORM& denorm,
1207
INT_TEMPLATES Templates,
1208
ADAPT_CLASS *Classes,
1209
UNICHAR_ID *Ambiguities,
1210
ADAPT_RESULTS *Results) {
1212
INT_FEATURE_ARRAY IntFeatures;
1057
void Classify::AmbigClassifier(
1058
const GenericVector<INT_FEATURE_STRUCT>& int_features,
1059
const INT_FX_RESULT_STRUCT& fx_info,
1061
INT_TEMPLATES templates,
1062
ADAPT_CLASS *classes,
1063
UNICHAR_ID *ambiguities,
1064
ADAPT_RESULTS *results) {
1065
if (int_features.empty()) return;
1213
1066
uinT8* CharNormArray = new uinT8[unicharset.size()];
1214
1067
INT_RESULT_STRUCT IntResult;
1217
AmbigClassifierCalls++;
1219
NumFeatures = GetCharNormFeatures(Blob, denorm, Templates, IntFeatures,
1220
NULL, CharNormArray,
1221
&(Results->BlobLength), NULL);
1222
if (NumFeatures <= 0) {
1223
delete [] CharNormArray;
1069
results->BlobLength = GetCharNormFeature(fx_info, templates, NULL,
1227
1071
bool debug = matcher_debug_level >= 2 || classify_debug_level > 1;
1229
1073
tprintf("AM Matches = ");
1231
int top = Blob->bounding_box().top();
1232
int bottom = Blob->bounding_box().bottom();
1233
while (*Ambiguities >= 0) {
1234
ClassId = *Ambiguities;
1075
int top = blob->bounding_box().top();
1076
int bottom = blob->bounding_box().bottom();
1077
while (*ambiguities >= 0) {
1078
CLASS_ID class_id = *ambiguities;
1236
im_.SetCharNormMatch(classify_integer_matcher_multiplier);
1237
im_.Match(ClassForClassId(Templates, ClassId),
1080
im_.Match(ClassForClassId(templates, class_id),
1238
1081
AllProtosOn, AllConfigsOn,
1239
NumFeatures, IntFeatures,
1082
int_features.size(), &int_features[0],
1241
1084
classify_adapt_feature_threshold, NO_DEBUG,
1242
1085
matcher_debug_separate_windows);
1244
ExpandShapesAndApplyCorrections(NULL, debug, ClassId, bottom, top, 0,
1245
Results->BlobLength, CharNormArray,
1246
IntResult, Results);
1249
NumAmbigClassesTried++;
1087
ExpandShapesAndApplyCorrections(NULL, debug, class_id, bottom, top, 0,
1088
results->BlobLength,
1089
classify_integer_matcher_multiplier,
1090
CharNormArray, IntResult, results);
1251
1093
delete [] CharNormArray;
1252
1094
} /* AmbigClassifier */
1420
1264
* @note Exceptions: none
1421
1265
* @note History: Tue Mar 12 19:38:03 1991, DSJ, Created.
1423
UNICHAR_ID *Classify::BaselineClassifier(TBLOB *Blob,
1424
const DENORM& denorm,
1425
ADAPT_TEMPLATES Templates,
1426
ADAPT_RESULTS *Results) {
1429
INT_FEATURE_ARRAY IntFeatures;
1267
UNICHAR_ID *Classify::BaselineClassifier(
1268
TBLOB *Blob, const GenericVector<INT_FEATURE_STRUCT>& int_features,
1269
const INT_FX_RESULT_STRUCT& fx_info,
1270
ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results) {
1271
if (int_features.empty()) return NULL;
1430
1272
uinT8* CharNormArray = new uinT8[unicharset.size()];
1433
BaselineClassifierCalls++;
1435
NumFeatures = GetBaselineFeatures(
1436
Blob, denorm, Templates->Templates, IntFeatures, CharNormArray,
1437
&(Results->BlobLength));
1438
if (NumFeatures <= 0) {
1439
delete [] CharNormArray;
1443
NumClasses = PruneClasses(Templates->Templates, NumFeatures, IntFeatures,
1444
CharNormArray, BaselineCutoffs, Results->CPResults);
1446
NumBaselineClassesTried += NumClasses;
1273
ClearCharNormArray(CharNormArray);
1275
Results->BlobLength = IntCastRounded(fx_info.Length / kStandardFeatureLength);
1276
PruneClasses(Templates->Templates, int_features.size(), &int_features[0],
1277
CharNormArray, BaselineCutoffs, &Results->CPResults);
1448
1279
if (matcher_debug_level >= 2 || classify_debug_level > 1)
1449
1280
cprintf ("BL Matches = ");
1451
im_.SetBaseLineMatch();
1452
MasterMatcher(Templates->Templates, NumFeatures, IntFeatures, CharNormArray,
1453
Templates->Class, matcher_debug_flags, NumClasses,
1282
MasterMatcher(Templates->Templates, int_features.size(), &int_features[0],
1284
Templates->Class, matcher_debug_flags, 0,
1454
1285
Blob->bounding_box(), Results->CPResults, Results);
1456
1287
delete [] CharNormArray;
1457
ClassId = Results->best_match.unichar_id;
1288
CLASS_ID ClassId = Results->best_match.unichar_id;
1458
1289
if (ClassId == NO_CLASS)
1460
1291
/* this is a bug - maybe should return "" */
1484
1314
* @note Exceptions: none
1485
1315
* @note History: Tue Mar 12 16:02:52 1991, DSJ, Created.
1487
int Classify::CharNormClassifier(TBLOB *Blob,
1488
const DENORM& denorm,
1489
INT_TEMPLATES Templates,
1490
ADAPT_RESULTS *Results) {
1493
INT_FEATURE_ARRAY IntFeatures;
1495
CharNormClassifierCalls++;
1497
uinT8* CharNormArray = new uinT8[unicharset.size()];
1498
int num_pruner_classes = MAX(unicharset.size(),
1499
PreTrainedTemplates->NumClasses);
1500
uinT8* PrunerNormArray = new uinT8[num_pruner_classes];
1501
NumFeatures = GetCharNormFeatures(Blob, denorm, Templates, IntFeatures,
1502
PrunerNormArray, CharNormArray,
1503
&(Results->BlobLength), NULL);
1504
if (NumFeatures <= 0) {
1505
delete [] CharNormArray;
1506
delete [] PrunerNormArray;
1317
int Classify::CharNormClassifier(TBLOB *blob,
1318
const TrainingSample& sample,
1319
ADAPT_RESULTS *adapt_results) {
1320
// This is the length that is used for scaling ratings vs certainty.
1321
adapt_results->BlobLength =
1322
IntCastRounded(sample.outline_length() / kStandardFeatureLength);
1323
GenericVector<UnicharRating> unichar_results;
1324
static_classifier_->UnicharClassifySample(sample, blob->denorm().pix(), 0,
1325
-1, &unichar_results);
1326
// Convert results to the format used internally by AdaptiveClassifier.
1327
for (int r = 0; r < unichar_results.size(); ++r) {
1328
int unichar_id = unichar_results[r].unichar_id;
1329
// Fonts are listed in order of preference.
1330
int font1 = unichar_results[r].fonts.size() >= 1
1331
? unichar_results[r].fonts[0] : kBlankFontinfoId;
1332
int font2 = unichar_results[r].fonts.size() >= 2
1333
? unichar_results[r].fonts[1] : kBlankFontinfoId;
1334
float rating = 1.0f - unichar_results[r].rating;
1335
AddNewResult(adapt_results, unichar_id, -1, rating, false, 0, font1, font2);
1510
NumClasses = PruneClasses(Templates, NumFeatures, IntFeatures,
1512
shape_table_ != NULL ? &shapetable_cutoffs_[0]
1514
Results->CPResults);
1516
if (tessedit_single_match && NumClasses > 1)
1518
NumCharNormClassesTried += NumClasses;
1520
im_.SetCharNormMatch(classify_integer_matcher_multiplier);
1521
MasterMatcher(Templates, NumFeatures, IntFeatures, CharNormArray,
1522
NULL, matcher_debug_flags, NumClasses,
1523
Blob->bounding_box(), Results->CPResults, Results);
1524
delete [] CharNormArray;
1525
delete [] PrunerNormArray;
1337
return sample.num_features();
1527
1338
} /* CharNormClassifier */
1529
1340
// As CharNormClassifier, but operates on a TrainingSample and outputs to
1530
1341
// a GenericVector of ShapeRating without conversion to classes.
1531
1342
int Classify::CharNormTrainingSample(bool pruner_only,
1532
1344
const TrainingSample& sample,
1533
GenericVector<ShapeRating>* results) {
1345
GenericVector<UnicharRating>* results) {
1534
1346
results->clear();
1535
1347
ADAPT_RESULTS* adapt_results = new ADAPT_RESULTS();
1536
1348
adapt_results->Initialize();
1537
1349
// Compute the bounding box of the features.
1538
1350
int num_features = sample.num_features();
1540
for (int f = 0; f < num_features; ++f) {
1541
const INT_FEATURE_STRUCT feature = sample.features()[f];
1542
TBOX fbox(feature.X, feature.Y, feature.X, feature.Y);
1351
// Only the top and bottom of the blob_box are used by MasterMatcher, so
1352
// fabricate right and left using top and bottom.
1353
TBOX blob_box(sample.geo_feature(GeoBottom), sample.geo_feature(GeoBottom),
1354
sample.geo_feature(GeoTop), sample.geo_feature(GeoTop));
1545
1355
// Compute the char_norm_array from the saved cn_feature.
1546
FEATURE norm_feature = NewFeature(&CharNormDesc);
1547
norm_feature->Params[CharNormY] = sample.cn_feature(CharNormY);
1548
norm_feature->Params[CharNormLength] = sample.cn_feature(CharNormLength);
1549
norm_feature->Params[CharNormRx] = sample.cn_feature(CharNormRx);
1550
norm_feature->Params[CharNormRy] = sample.cn_feature(CharNormRy);
1356
FEATURE norm_feature = sample.GetCNFeature();
1551
1357
uinT8* char_norm_array = new uinT8[unicharset.size()];
1552
1358
int num_pruner_classes = MAX(unicharset.size(),
1553
1359
PreTrainedTemplates->NumClasses);
1557
1363
ComputeCharNormArrays(norm_feature, PreTrainedTemplates, char_norm_array,
1558
1364
pruner_norm_array);
1560
int num_classes = PruneClasses(PreTrainedTemplates, num_features,
1563
shape_table_ != NULL ? &shapetable_cutoffs_[0]
1565
adapt_results->CPResults);
1366
PruneClasses(PreTrainedTemplates, num_features, sample.features(),
1368
shape_table_ != NULL ? &shapetable_cutoffs_[0] : CharNormCutoffs,
1369
&adapt_results->CPResults);
1566
1370
delete [] pruner_norm_array;
1371
if (keep_this >= 0) {
1372
adapt_results->CPResults[0].Class = keep_this;
1373
adapt_results->CPResults.truncate(1);
1567
1375
if (pruner_only) {
1568
1376
// Convert pruner results to output format.
1569
for (int i = 0; i < num_classes; ++i) {
1377
for (int i = 0; i < adapt_results->CPResults.size(); ++i) {
1570
1378
int class_id = adapt_results->CPResults[i].Class;
1571
int shape_id = class_id;
1572
if (shape_table_ != NULL) {
1573
// All shapes in a class have the same combination of unichars, so
1574
// it doesn't really matter which config we give it, as we aren't
1575
// trying to get the font here.
1576
shape_id = ClassAndConfigIDToFontOrShapeID(class_id, 0);
1578
1379
results->push_back(
1579
ShapeRating(shape_id, 1.0f - adapt_results->CPResults[i].Rating));
1380
UnicharRating(class_id, 1.0f - adapt_results->CPResults[i].Rating));
1582
im_.SetCharNormMatch(classify_integer_matcher_multiplier);
1583
1383
MasterMatcher(PreTrainedTemplates, num_features, sample.features(),
1584
1384
char_norm_array,
1585
NULL, matcher_debug_flags, num_classes,
1385
NULL, matcher_debug_flags,
1386
classify_integer_matcher_multiplier,
1586
1387
blob_box, adapt_results->CPResults, adapt_results);
1587
1388
// Convert master matcher results to output format.
1588
for (int i = 0; i < adapt_results->NumMatches; i++) {
1389
for (int i = 0; i < adapt_results->match.size(); i++) {
1589
1390
ScoredClass next = adapt_results->match[i];
1590
results->push_back(ShapeRating(next.shape_id, 1.0f - next.rating));
1391
UnicharRating rating(next.unichar_id, 1.0f - next.rating);
1392
if (next.fontinfo_id >= 0) {
1393
rating.fonts.push_back(next.fontinfo_id);
1394
if (next.fontinfo_id2 >= 0)
1395
rating.fonts.push_back(next.fontinfo_id2);
1397
results->push_back(rating);
1592
results->sort(&ShapeRating::SortDescendingRating);
1399
results->sort(&UnicharRating::SortDescendingRating);
1594
1401
delete [] char_norm_array;
1595
1402
delete adapt_results;
1717
1525
Rating *= rating_scale * Results->BlobLength;
1718
1526
Certainty *= -(getDict().certainty_scale);
1720
inT16 min_xheight, max_xheight;
1528
// Adapted results, by their very nature, should have good certainty.
1529
// Those that don't are at best misleading, and often lead to errors,
1530
// so don't accept adapted results that are too far behind the best result,
1531
// whether adapted or static.
1532
// TODO(rays) find some way of automatically tuning these constants.
1533
if (Certainty > best_certainty) {
1534
best_certainty = MIN(Certainty, classify_adapted_pruning_threshold);
1535
} else if (adapted &&
1536
Certainty / classify_adapted_pruning_factor < best_certainty) {
1537
continue; // Don't accept bad adapted results.
1540
float min_xheight, max_xheight, yshift;
1721
1541
denorm.XHeightRange(next.unichar_id, unicharset, box,
1722
&min_xheight, &max_xheight);
1542
&min_xheight, &max_xheight, &yshift);
1723
1543
temp_it.add_to_end(new BLOB_CHOICE(next.unichar_id, Rating, Certainty,
1724
1544
fontinfo_id, fontinfo_id2,
1725
1545
unicharset.get_script(next.unichar_id),
1726
min_xheight, max_xheight, adapted));
1546
min_xheight, max_xheight, yshift,
1547
adapted ? BCC_ADAPTED_CLASSIFIER
1548
: BCC_STATIC_CLASSIFIER));
1727
1549
contains_nonfrag |= !current_is_frag; // update contains_nonfrag
1728
1550
choices_length++;
1729
1551
if (choices_length >= max_matches) break;
1731
Results->NumMatches = choices_length;
1553
Results->match.truncate(choices_length);
1732
1554
} // ConvertMatchesToChoices
1745
1566
* @note Exceptions: none
1746
1567
* @note History: Wed Mar 13 16:44:41 1991, DSJ, Created.
1748
void Classify::DebugAdaptiveClassifier(TBLOB *Blob,
1749
const DENORM& denorm,
1569
void Classify::DebugAdaptiveClassifier(TBLOB *blob,
1750
1570
ADAPT_RESULTS *Results) {
1751
for (int i = 0; i < Results->NumMatches; i++) {
1752
if (Results->match[i].rating < Results->best_match.rating)
1571
if (static_classifier_ == NULL) return;
1572
for (int i = 0; i < Results->match.size(); i++) {
1573
if (i == 0 || Results->match[i].rating < Results->best_match.rating)
1753
1574
Results->best_match = Results->match[i];
1755
const char *Prompt =
1756
"Left-click in IntegerMatch Window to continue or right click to debug...";
1757
CLASS_ID unichar_id = Results->best_match.unichar_id;
1758
int shape_id = Results->best_match.shape_id;
1759
bool adaptive_on = true;
1760
bool pretrained_on = true;
1762
const char* debug_mode;
1765
debug_mode = "Adaptive Templates Only";
1766
else if (!adaptive_on)
1767
debug_mode = "PreTrained Templates Only";
1769
debug_mode = "All Templates";
1771
tprintf("Debugging class %d = %s in mode %s ...",
1772
unichar_id, unicharset.id_to_unichar(unichar_id), debug_mode);
1773
if (shape_id >= 0 && shape_table_ != NULL) {
1774
tprintf(" from shape %s\n", shape_table_->DebugStr(shape_id).string());
1776
ShowBestMatchFor(Blob, denorm, unichar_id, shape_id, adaptive_on,
1777
pretrained_on, Results);
1778
UpdateMatchDisplay();
1779
} while ((unichar_id = GetClassToDebug(Prompt, &adaptive_on,
1780
&pretrained_on, &shape_id)) != 0);
1576
INT_FX_RESULT_STRUCT fx_info;
1577
GenericVector<INT_FEATURE_STRUCT> bl_features;
1578
TrainingSample* sample =
1579
BlobToTrainingSample(*blob, false, &fx_info, &bl_features);
1580
if (sample == NULL) return;
1581
static_classifier_->DebugDisplay(*sample, blob->denorm().pix(),
1582
Results->best_match.unichar_id);
1781
1583
} /* DebugAdaptiveClassifier */
1805
1606
* @note Exceptions: none
1806
1607
* @note History: Tue Mar 12 08:50:11 1991, DSJ, Created.
1808
void Classify::DoAdaptiveMatch(TBLOB *Blob,
1809
const DENORM& denorm,
1810
ADAPT_RESULTS *Results) {
1609
void Classify::DoAdaptiveMatch(TBLOB *Blob, ADAPT_RESULTS *Results) {
1811
1610
UNICHAR_ID *Ambiguities;
1813
AdaptiveMatcherCalls++;
1612
INT_FX_RESULT_STRUCT fx_info;
1613
GenericVector<INT_FEATURE_STRUCT> bl_features;
1614
TrainingSample* sample =
1615
BlobToTrainingSample(*Blob, classify_nonlinear_norm, &fx_info,
1617
if (sample == NULL) return;
1816
1619
if (AdaptedTemplates->NumPermClasses < matcher_permanent_classes_min ||
1817
1620
tess_cn_matching) {
1818
CharNormClassifier(Blob, denorm, PreTrainedTemplates, Results);
1621
CharNormClassifier(Blob, *sample, Results);
1820
Ambiguities = BaselineClassifier(Blob, denorm, AdaptedTemplates, Results);
1821
if ((Results->NumMatches > 0 &&
1822
MarginalMatch (Results->best_match.rating) &&
1623
Ambiguities = BaselineClassifier(Blob, bl_features, fx_info,
1624
AdaptedTemplates, Results);
1625
if ((!Results->match.empty() && MarginalMatch(Results->best_match.rating) &&
1823
1626
!tess_bn_matching) ||
1824
Results->NumMatches == 0) {
1825
CharNormClassifier(Blob, denorm, PreTrainedTemplates, Results);
1627
Results->match.empty()) {
1628
CharNormClassifier(Blob, *sample, Results);
1826
1629
} else if (Ambiguities && *Ambiguities >= 0 && !tess_bn_matching) {
1827
AmbigClassifier(Blob, denorm,
1630
AmbigClassifier(bl_features, fx_info, Blob,
1828
1631
PreTrainedTemplates,
1829
1632
AdaptedTemplates->Class,
1836
1639
// if the results contain only fragments.
1837
1640
// TODO(daria): verify that this is better than
1838
1641
// just adding a NULL classification.
1839
if (!Results->HasNonfragment || Results->NumMatches == 0)
1642
if (!Results->HasNonfragment || Results->match.empty())
1840
1643
ClassifyAsNoise(Results);
1841
1645
} /* DoAdaptiveMatch */
1843
1647
/*---------------------------------------------------------------------------*/
1845
* This routine tries to estimate how tight the adaptation
1846
* threshold should be set for each character in the current
1847
* word. In general, the routine tries to set tighter
1848
* thresholds for a character when the current set of templates
1849
* would have made an error on that character. It tries
1850
* to set a threshold tight enough to eliminate the error.
1851
* Two different sets of rules can be used to determine the
1852
* desired thresholds.
1854
* @param Word current word
1855
* @param denorm normalization/denormalization parameters
1856
* @param BestChoice best choice for current word with context
1857
* @param BestRawChoice best choice for current word without context
1858
* @param[out] Thresholds array of thresholds to be filled in
1861
* - matcher_good_threshold
1862
* - matcher_perfect_threshold
1863
* - matcher_rating_margin
1865
* @return none (results are returned in Thresholds)
1866
* @note Exceptions: none
1867
* @note History: Fri May 31 09:22:08 1991, DSJ, Created.
1869
void Classify::GetAdaptThresholds(TWERD * Word,
1870
const DENORM& denorm,
1871
const WERD_CHOICE& BestChoice,
1872
const WERD_CHOICE& BestRawChoice,
1873
FLOAT32 Thresholds[]) {
1874
getDict().FindClassifierErrors(matcher_perfect_threshold,
1875
matcher_good_threshold,
1876
matcher_rating_margin,
1878
} /* GetAdaptThresholds */
1880
/*---------------------------------------------------------------------------*/
1882
1649
* This routine matches blob to the built-in templates
1883
1650
* to find out if there are any classes other than the correct
1884
1651
* class which are potential ambiguities.
1886
1653
* @param Blob blob to get classification ambiguities for
1887
* @param denorm normalization/denormalization parameters
1888
1654
* @param CorrectClass correct class for Blob
1896
1662
* @note History: Fri Mar 15 08:08:22 1991, DSJ, Created.
1898
1664
UNICHAR_ID *Classify::GetAmbiguities(TBLOB *Blob,
1899
const DENORM& denorm,
1900
1665
CLASS_ID CorrectClass) {
1901
1666
ADAPT_RESULTS *Results = new ADAPT_RESULTS();
1902
1667
UNICHAR_ID *Ambiguities;
1905
1670
Results->Initialize();
1671
INT_FX_RESULT_STRUCT fx_info;
1672
GenericVector<INT_FEATURE_STRUCT> bl_features;
1673
TrainingSample* sample =
1674
BlobToTrainingSample(*Blob, classify_nonlinear_norm, &fx_info,
1676
if (sample == NULL) {
1907
CharNormClassifier(Blob, denorm, PreTrainedTemplates, Results);
1681
CharNormClassifier(Blob, *sample, Results);
1908
1683
RemoveBadMatches(Results);
1909
qsort((void *)Results->match, Results->NumMatches,
1910
sizeof(ScoredClass), CompareByRating);
1684
Results->match.sort(CompareByRating);
1912
1686
/* copy the class ids into a string of ambiguities - don't copy if
1913
1687
the correct class is the only class id matched */
1914
Ambiguities = (UNICHAR_ID *) Emalloc (sizeof (UNICHAR_ID) *
1915
(Results->NumMatches + 1));
1916
if (Results->NumMatches > 1 ||
1917
(Results->NumMatches == 1 &&
1688
Ambiguities = new UNICHAR_ID[Results->match.size() + 1];
1689
if (Results->match.size() > 1 ||
1690
(Results->match.size() == 1 &&
1918
1691
Results->match[0].unichar_id != CorrectClass)) {
1919
for (i = 0; i < Results->NumMatches; i++)
1692
for (i = 0; i < Results->match.size(); i++)
1920
1693
Ambiguities[i] = Results->match[i].unichar_id;
1921
1694
Ambiguities[i] = -1;
1927
1700
return Ambiguities;
1928
1701
} /* GetAmbiguities */
1930
/*---------------------------------------------------------------------------*/
1932
* This routine calls the integer (Hardware) feature
1933
* extractor if it has not been called before for this blob.
1934
* The results from the feature extractor are placed into
1935
* globals so that they can be used in other routines without
1936
* re-extracting the features.
1937
* It then copies the baseline features into the IntFeatures
1938
* array provided by the caller.
1940
* @param Blob blob to extract features from
1941
* @param denorm normalization/denormalization parameters
1942
* @param Templates used to compute char norm adjustments
1943
* @param IntFeatures array to fill with integer features
1944
* @param CharNormArray array to fill with dummy char norm adjustments
1945
* @param BlobLength length of blob in baseline-normalized units
1948
* - FeaturesHaveBeenExtracted TRUE if fx has been done
1949
* - BaselineFeatures holds extracted baseline feat
1950
* - CharNormFeatures holds extracted char norm feat
1951
* - FXInfo holds misc. FX info
1953
* @return Number of features extracted or 0 if an error occurred.
1954
* @note Exceptions: none
1955
* @note History: Tue May 28 10:40:52 1991, DSJ, Created.
1957
int Classify::GetBaselineFeatures(TBLOB *Blob,
1958
const DENORM& denorm,
1959
INT_TEMPLATES Templates,
1960
INT_FEATURE_ARRAY IntFeatures,
1961
uinT8* CharNormArray,
1962
inT32 *BlobLength) {
1963
register INT_FEATURE Src, Dest, End;
1965
if (!FeaturesHaveBeenExtracted) {
1966
FeaturesOK = ExtractIntFeat(Blob, denorm, BaselineFeatures,
1967
CharNormFeatures, &FXInfo, NULL);
1968
FeaturesHaveBeenExtracted = TRUE;
1972
*BlobLength = FXInfo.NumBL;
1976
for (Src = BaselineFeatures, End = Src + FXInfo.NumBL, Dest = IntFeatures;
1980
ClearCharNormArray(CharNormArray);
1981
*BlobLength = FXInfo.NumBL;
1982
return FXInfo.NumBL;
1983
} /* GetBaselineFeatures */
1985
void Classify::ResetFeaturesHaveBeenExtracted() {
1986
FeaturesHaveBeenExtracted = FALSE;
1989
1703
// Returns true if the given blob looks too dissimilar to any character
1990
1704
// present in the classifier templates.
1991
bool Classify::LooksLikeGarbage(const DENORM& denorm, TBLOB *blob) {
1705
bool Classify::LooksLikeGarbage(TBLOB *blob) {
1992
1706
BLOB_CHOICE_LIST *ratings = new BLOB_CHOICE_LIST();
1993
AdaptiveClassifier(blob, denorm, ratings, NULL);
1707
AdaptiveClassifier(blob, ratings);
1994
1708
BLOB_CHOICE_IT ratings_it(ratings);
1995
1709
const UNICHARSET &unicharset = getDict().getUnicharset();
1996
1710
if (classify_debug_character_fragments) {
2023
1738
* array provided by the caller.
2025
1740
* @param Blob blob to extract features from
2026
* @param denorm normalization/denormalization parameters
2027
1741
* @param Templates used to compute char norm adjustments
2028
1742
* @param IntFeatures array to fill with integer features
2029
1743
* @param PrunerNormArray Array of factors from blob normalization
2031
1745
* @param CharNormArray array to fill with dummy char norm adjustments
2032
1746
* @param BlobLength length of blob in baseline-normalized units
2033
* @param FeatureOutlineArray
2036
* - FeaturesHaveBeenExtracted TRUE if fx has been done
2037
* - BaselineFeatures holds extracted baseline feat
2038
* - CharNormFeatures holds extracted char norm feat
2039
* - FXInfo holds misc. FX info
2041
1750
* @return Number of features extracted or 0 if an error occurred.
2042
1751
* @note Exceptions: none
2043
1752
* @note History: Tue May 28 10:40:52 1991, DSJ, Created.
2045
int Classify::GetCharNormFeatures(TBLOB *Blob,
2046
const DENORM& denorm,
2047
INT_TEMPLATES Templates,
2048
INT_FEATURE_ARRAY IntFeatures,
2049
uinT8* PrunerNormArray,
2050
uinT8* CharNormArray,
2052
inT32 *FeatureOutlineArray) {
2053
register INT_FEATURE Src, Dest, End;
2054
FEATURE NormFeature;
2055
FLOAT32 Baseline, Scale;
2056
inT32 FeatureOutlineIndex[MAX_NUM_INT_FEATURES];
2058
if (!FeaturesHaveBeenExtracted) {
2059
FeaturesOK = ExtractIntFeat(Blob, denorm, BaselineFeatures,
2060
CharNormFeatures, &FXInfo,
2061
FeatureOutlineIndex);
2062
FeaturesHaveBeenExtracted = TRUE;
2066
*BlobLength = FXInfo.NumBL;
2070
for (Src = CharNormFeatures, End = Src + FXInfo.NumCN, Dest = IntFeatures;
2073
for (int i = 0; FeatureOutlineArray && i < FXInfo.NumCN; ++i) {
2074
FeatureOutlineArray[i] = FeatureOutlineIndex[i];
2077
NormFeature = NewFeature(&CharNormDesc);
2078
Baseline = BASELINE_OFFSET;
2079
Scale = MF_SCALE_FACTOR;
2080
NormFeature->Params[CharNormY] = (FXInfo.Ymean - Baseline) * Scale;
2081
NormFeature->Params[CharNormLength] =
2082
FXInfo.Length * Scale / LENGTH_COMPRESSION;
2083
NormFeature->Params[CharNormRx] = FXInfo.Rx * Scale;
2084
NormFeature->Params[CharNormRy] = FXInfo.Ry * Scale;
2085
ComputeCharNormArrays(NormFeature, Templates, CharNormArray, PrunerNormArray);
2086
*BlobLength = FXInfo.NumBL;
2087
return (FXInfo.NumCN);
2088
} /* GetCharNormFeatures */
1754
int Classify::GetCharNormFeature(const INT_FX_RESULT_STRUCT& fx_info,
1755
INT_TEMPLATES templates,
1756
uinT8* pruner_norm_array,
1757
uinT8* char_norm_array) {
1758
FEATURE norm_feature = NewFeature(&CharNormDesc);
1759
float baseline = kBlnBaselineOffset;
1760
float scale = MF_SCALE_FACTOR;
1761
norm_feature->Params[CharNormY] = (fx_info.Ymean - baseline) * scale;
1762
norm_feature->Params[CharNormLength] =
1763
fx_info.Length * scale / LENGTH_COMPRESSION;
1764
norm_feature->Params[CharNormRx] = fx_info.Rx * scale;
1765
norm_feature->Params[CharNormRy] = fx_info.Ry * scale;
1766
// Deletes norm_feature.
1767
ComputeCharNormArrays(norm_feature, templates, char_norm_array,
1769
return IntCastRounded(fx_info.Length / kStandardFeatureLength);
1770
} /* GetCharNormFeature */
2090
1772
// Computes the char_norm_array for the unicharset and, if not NULL, the
2091
1773
// pruner_array as appropriate according to the existence of the shape_table.
2556
2236
/*---------------------------------------------------------------------------*/
2558
* This routine compares Blob to both sets of templates
2559
* (adaptive and pre-trained) and then displays debug
2560
* information for the config which matched best.
2562
* @param Blob blob to show best matching config for
2563
* @param denorm normalization/denormalization parameters
2564
* @param ClassId class whose configs are to be searched
2565
* @param shape_id shape index
2566
* @param AdaptiveOn TRUE if adaptive configs are enabled
2567
* @param PreTrainedOn TRUE if pretrained configs are enabled
2568
* @param Results results of match being debugged
2571
* - PreTrainedTemplates built-in training
2572
* - AdaptedTemplates adaptive templates
2573
* - AllProtosOn dummy proto mask
2574
* - AllConfigsOn dummy config mask
2238
* This routine displays debug information for the best config
2239
* of the given shape_id for the given set of features.
2241
* @param shape_id classifier id to work with
2242
* @param features features of the unknown character
2243
* @param num_features Number of features in the features array.
2576
2245
* @note Exceptions: none
2577
2246
* @note History: Fri Mar 22 08:43:52 1991, DSJ, Created.
2579
void Classify::ShowBestMatchFor(TBLOB *Blob,
2580
const DENORM& denorm,
2585
ADAPT_RESULTS *Results) {
2586
int NumCNFeatures = 0, NumBLFeatures = 0;
2587
INT_FEATURE_ARRAY CNFeatures, BLFeatures;
2588
INT_RESULT_STRUCT CNResult, BLResult;
2591
static int next_config = -1;
2593
if (PreTrainedOn) next_config = -1;
2595
CNResult.Rating = BLResult.Rating = 2.0;
2597
if (!LegalClassId (ClassId)) {
2598
cprintf ("%d is not a legal class id!!\n", ClassId);
2602
uinT8 *CNAdjust = new uinT8[MAX_NUM_CLASSES];
2603
uinT8 *BLAdjust = new uinT8[MAX_NUM_CLASSES];
2605
if (shape_table_ == NULL)
2608
shape_id = ShapeIDToClassID(shape_id);
2609
if (PreTrainedOn && shape_id >= 0) {
2610
if (UnusedClassIdIn(PreTrainedTemplates, shape_id)) {
2611
tprintf("No built-in templates for class/shape %d\n", shape_id);
2613
NumCNFeatures = GetCharNormFeatures(Blob, denorm, PreTrainedTemplates,
2614
CNFeatures, NULL, CNAdjust,
2616
if (NumCNFeatures <= 0) {
2617
tprintf("Illegal blob (char norm features)!\n");
2619
im_.SetCharNormMatch(classify_integer_matcher_multiplier);
2620
im_.Match(ClassForClassId(PreTrainedTemplates, shape_id),
2621
AllProtosOn, AllConfigsOn,
2622
NumCNFeatures, CNFeatures,
2624
classify_adapt_feature_threshold, NO_DEBUG,
2625
matcher_debug_separate_windows);
2626
ExpandShapesAndApplyCorrections(NULL, false, shape_id,
2627
Blob->bounding_box().bottom(),
2628
Blob->bounding_box().top(),
2629
0, BlobLength, CNAdjust,
2636
if (ClassId < 0 || ClassId >= AdaptedTemplates->Templates->NumClasses) {
2637
tprintf("Invalid adapted class id: %d\n", ClassId);
2638
} else if (UnusedClassIdIn(AdaptedTemplates->Templates, ClassId) ||
2639
AdaptedTemplates->Class[ClassId] == NULL ||
2640
IsEmptyAdaptedClass(AdaptedTemplates->Class[ClassId])) {
2641
tprintf("No AD templates for class %d = %s\n",
2642
ClassId, unicharset.id_to_unichar(ClassId));
2644
NumBLFeatures = GetBaselineFeatures(Blob,
2646
AdaptedTemplates->Templates,
2647
BLFeatures, BLAdjust,
2649
if (NumBLFeatures <= 0)
2650
tprintf("Illegal blob (baseline features)!\n");
2652
im_.SetBaseLineMatch();
2653
im_.Match(ClassForClassId(AdaptedTemplates->Templates, ClassId),
2654
AllProtosOn, AllConfigsOn,
2655
NumBLFeatures, BLFeatures,
2657
classify_adapt_feature_threshold, NO_DEBUG,
2658
matcher_debug_separate_windows);
2659
ExpandShapesAndApplyCorrections(
2660
AdaptedTemplates->Class, false,
2661
ClassId, Blob->bounding_box().bottom(),
2662
Blob->bounding_box().top(), 0, BlobLength, CNAdjust,
2249
void Classify::ShowBestMatchFor(int shape_id,
2250
const INT_FEATURE_STRUCT* features,
2252
#ifndef GRAPHICS_DISABLED
2254
if (UnusedClassIdIn(PreTrainedTemplates, shape_id)) {
2255
tprintf("No built-in templates for class/shape %d\n", shape_id);
2258
if (num_features <= 0) {
2259
tprintf("Illegal blob (char norm features)!\n");
2262
INT_RESULT_STRUCT cn_result;
2263
classify_norm_method.set_value(character);
2264
im_.Match(ClassForClassId(PreTrainedTemplates, shape_id),
2265
AllProtosOn, AllConfigsOn,
2266
num_features, features, &cn_result,
2267
classify_adapt_feature_threshold, NO_DEBUG,
2268
matcher_debug_separate_windows);
2669
if (BLResult.Rating < CNResult.Rating) {
2670
if (next_config < 0) {
2671
ConfigMask = 1 << BLResult.Config;
2674
ConfigMask = 1 << next_config;
2677
classify_norm_method.set_value(baseline);
2679
im_.SetBaseLineMatch();
2680
tprintf("Adaptive Class ID: %d\n", ClassId);
2681
im_.Match(ClassForClassId(AdaptedTemplates->Templates, ClassId),
2682
AllProtosOn, (BIT_VECTOR) &ConfigMask,
2683
NumBLFeatures, BLFeatures,
2685
classify_adapt_feature_threshold,
2686
matcher_debug_flags,
2687
matcher_debug_separate_windows);
2688
ExpandShapesAndApplyCorrections(
2689
AdaptedTemplates->Class, true,
2690
ClassId, Blob->bounding_box().bottom(),
2691
Blob->bounding_box().top(), 0, BlobLength, CNAdjust,
2693
} else if (shape_id >= 0) {
2694
ConfigMask = 1 << CNResult.Config;
2695
classify_norm_method.set_value(character);
2697
tprintf("Static Shape ID: %d\n", shape_id);
2698
im_.SetCharNormMatch(classify_integer_matcher_multiplier);
2699
im_.Match(ClassForClassId (PreTrainedTemplates, shape_id),
2700
AllProtosOn, (BIT_VECTOR) & ConfigMask,
2701
NumCNFeatures, CNFeatures,
2703
classify_adapt_feature_threshold,
2704
matcher_debug_flags,
2705
matcher_debug_separate_windows);
2706
ExpandShapesAndApplyCorrections(NULL, true, shape_id,
2707
Blob->bounding_box().bottom(),
2708
Blob->bounding_box().top(),
2709
0, BlobLength, CNAdjust,
2270
config_mask = 1 << cn_result.Config;
2272
tprintf("Static Shape ID: %d\n", shape_id);
2274
im_.Match(ClassForClassId(PreTrainedTemplates, shape_id),
2275
AllProtosOn, reinterpret_cast<BIT_VECTOR>(&config_mask),
2276
num_features, features, &cn_result,
2277
classify_adapt_feature_threshold,
2278
matcher_debug_flags,
2279
matcher_debug_separate_windows);
2280
UpdateMatchDisplay();
2281
#endif // GRAPHICS_DISABLED
2716
2282
} /* ShowBestMatchFor */
2718
2284
// Returns a string for the classifier class_id: either the corresponding