~vcs-imports/tesseract-ocr/trunk

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
// Copyright 2010 Google Inc. All Rights Reserved.
// Author: rays@google.com (Ray Smith)
///////////////////////////////////////////////////////////////////////
// File:        shapetable.h
// Description: Class to map a classifier shape index to unicharset
//              indices and font indices.
// Author:      Ray Smith
// Created:     Thu Oct 28 17:46:32 PDT 2010
//
// (C) Copyright 2010, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

#ifndef TESSERACT_CLASSIFY_SHAPETABLE_H_
#define TESSERACT_CLASSIFY_SHAPETABLE_H_

#include "bitvector.h"
#include "genericheap.h"
#include "genericvector.h"
#include "intmatcher.h"

class STRING;
class UNICHARSET;

namespace tesseract {

struct FontInfo;
class FontInfoTable;
class ShapeTable;

// Simple struct to hold a single classifier unichar selection, a corresponding
// rating, and a list of appropriate fonts.
struct UnicharRating {
  UnicharRating() : unichar_id(0), rating(0.0f) {}
  UnicharRating(int u, float r)
    : unichar_id(u), rating(r) {}

  // Sort function to sort ratings appropriately by descending rating.
  static int SortDescendingRating(const void* t1, const void* t2) {
    const UnicharRating* a = reinterpret_cast<const UnicharRating *>(t1);
    const UnicharRating* b = reinterpret_cast<const UnicharRating *>(t2);
    if (a->rating > b->rating) {
      return -1;
    } else if (a->rating < b->rating) {
      return 1;
    } else {
      return a->unichar_id - b->unichar_id;
    }
  }
  // Helper function to get the index of the first result with the required
  // unichar_id. If the results are sorted by rating, this will also be the
  // best result with the required unichar_id.
  // Returns -1 if the unichar_id is not found
  static int FirstResultWithUnichar(const GenericVector<UnicharRating>& results,
                                    UNICHAR_ID unichar_id);

  // Index into some UNICHARSET table indicates the class of the answer.
  UNICHAR_ID unichar_id;
  // Rating from classifier with 1.0 perfect and 0.0 impossible.
  // Call it a probability if you must.
  float rating;
  // Set of fonts for this shape in order of decreasing preference.
  // (There is no mechanism for storing scores for fonts as yet.)
  GenericVector<int> fonts;
};

// Classifier result from a low-level classification is an index into some
// ShapeTable and a rating.
struct ShapeRating {
  ShapeRating()
    : shape_id(0), rating(0.0f), raw(0.0f), font(0.0f),
      joined(false), broken(false) {}
  ShapeRating(int s, float r)
    : shape_id(s), rating(r), raw(1.0f), font(0.0f),
      joined(false), broken(false) {}

  // Sort function to sort ratings appropriately by descending rating.
  static int SortDescendingRating(const void* t1, const void* t2) {
    const ShapeRating* a = reinterpret_cast<const ShapeRating *>(t1);
    const ShapeRating* b = reinterpret_cast<const ShapeRating *>(t2);
    if (a->rating > b->rating) {
      return -1;
    } else if (a->rating < b->rating) {
      return 1;
    } else {
      return a->shape_id - b->shape_id;
    }
  }
  // Helper function to get the index of the first result with the required
  // unichar_id. If the results are sorted by rating, this will also be the
  // best result with the required unichar_id.
  // Returns -1 if the unichar_id is not found
  static int FirstResultWithUnichar(const GenericVector<ShapeRating>& results,
                                    const ShapeTable& shape_table,
                                    UNICHAR_ID unichar_id);

  // Index into some shape table indicates the class of the answer.
  int shape_id;
  // Rating from classifier with 1.0 perfect and 0.0 impossible.
  // Call it a probability if you must.
  float rating;
  // Subsidiary rating that a classifier may use internally.
  float raw;
  // Subsidiary rating that a classifier may use internally.
  float font;
  // Flag indicating that the input may be joined.
  bool joined;
  // Flag indicating that the input may be broken (a fragment).
  bool broken;
};

// Simple struct to hold an entry for a heap-based priority queue of
// ShapeRating.
struct ShapeQueueEntry {
  ShapeQueueEntry() : result(ShapeRating(0, 0.0f)), level(0) {}
  ShapeQueueEntry(const ShapeRating& rating, int level0)
    : result(rating), level(level0) {}

  // Sort by decreasing rating and decreasing level for equal rating.
  bool operator<(const ShapeQueueEntry& other) const {
    if (result.rating > other.result.rating) return true;
    if (result.rating == other.result.rating)
      return level > other.level;
    return false;
  }

  // Output from classifier.
  ShapeRating result;
  // Which level in the tree did this come from?
  int level;
};
typedef GenericHeap<ShapeQueueEntry> ShapeQueue;

// Simple struct to hold a set of fonts associated with a single unichar-id.
// A vector of UnicharAndFonts makes a shape.
struct UnicharAndFonts {
  UnicharAndFonts() : unichar_id(0) {
  }
  UnicharAndFonts(int uni_id, int font_id) : unichar_id(uni_id) {
    font_ids.push_back(font_id);
  }

  // Writes to the given file. Returns false in case of error.
  bool Serialize(FILE* fp) const;
  // Reads from the given file. Returns false in case of error.
  // If swap is true, assumes a big/little-endian swap is needed.
  bool DeSerialize(bool swap, FILE* fp);

  // Sort function to sort a pair of UnicharAndFonts by unichar_id.
  static int SortByUnicharId(const void* v1, const void* v2);

  GenericVector<inT32> font_ids;
  inT32 unichar_id;
};

// A Shape is a collection of unichar-ids and a list of fonts associated with
// each, organized as a vector of UnicharAndFonts. Conceptually a Shape is
// a classifiable unit, and represents a group of characters or parts of
// characters that have a similar or identical shape. Shapes/ShapeTables may
// be organized hierarchically from identical shapes at the leaves to vaguely
// similar shapes near the root.
class Shape {
 public:
  Shape() : destination_index_(-1) {}

  // Writes to the given file. Returns false in case of error.
  bool Serialize(FILE* fp) const;
  // Reads from the given file. Returns false in case of error.
  // If swap is true, assumes a big/little-endian swap is needed.
  bool DeSerialize(bool swap, FILE* fp);

  int destination_index() const {
    return destination_index_;
  }
  void set_destination_index(int index) {
    destination_index_ = index;
  }
  int size() const {
    return unichars_.size();
  }
  // Returns a UnicharAndFonts entry for the given index, which must be
  // in the range [0, size()).
  const UnicharAndFonts& operator[](int index) const {
    return unichars_[index];
  }
  // Sets the unichar_id of the given index to the new unichar_id.
  void SetUnicharId(int index, int unichar_id) {
    unichars_[index].unichar_id = unichar_id;
  }
  // Adds a font_id for the given unichar_id. If the unichar_id is not
  // in the shape, it is added.
  void AddToShape(int unichar_id, int font_id);
  // Adds everything in other to this.
  void AddShape(const Shape& other);
  // Returns true if the shape contains the given unichar_id, font_id pair.
  bool ContainsUnicharAndFont(int unichar_id, int font_id) const;
  // Returns true if the shape contains the given unichar_id, ignoring font.
  bool ContainsUnichar(int unichar_id) const;
  // Returns true if the shape contains the given font, ignoring unichar_id.
  bool ContainsFont(int font_id) const;
  // Returns true if the shape contains the given font properties, ignoring
  // unichar_id.
  bool ContainsFontProperties(const FontInfoTable& font_table,
                              uinT32 properties) const;
  // Returns true if the shape contains multiple different font properties,
  // ignoring unichar_id.
  bool ContainsMultipleFontProperties(const FontInfoTable& font_table) const;
  // Returns true if this shape is equal to other (ignoring order of unichars
  // and fonts).
  bool operator==(const Shape& other) const;
  // Returns true if this is a subset (including equal) of other.
  bool IsSubsetOf(const Shape& other) const;
  // Returns true if the lists of unichar ids are the same in this and other,
  // ignoring fonts.
  // NOT const, as it will sort the unichars on demand.
  bool IsEqualUnichars(Shape* other);

 private:
  // Sorts the unichars_ vector by unichar.
  void SortUnichars();

  // Flag indicates that the unichars are sorted, allowing faster set
  // operations with another shape.
  bool unichars_sorted_;
  // If this Shape is part of a ShapeTable the destiation_index_ is the index
  // of some other shape in the ShapeTable with which this shape is merged.
  int destination_index_;
  // Array of unichars, each with a set of fonts. Each unichar has at most
  // one entry in the vector.
  GenericVector<UnicharAndFonts> unichars_;
};

// ShapeTable is a class to encapsulate the triple indirection that is
// used here.
// ShapeTable is a vector of shapes.
// Each shape is a vector of UnicharAndFonts representing the set of unichars
// that the shape represents.
// Each UnicharAndFonts also lists the fonts of the unichar_id that were
// mapped to the shape during training.
class ShapeTable {
 public:
  ShapeTable();
  // The UNICHARSET reference supplied here, or in set_unicharset below must
  // exist for the entire life of the ShapeTable. It is used only by DebugStr.
  explicit ShapeTable(const UNICHARSET& unicharset);

  // Writes to the given file. Returns false in case of error.
  bool Serialize(FILE* fp) const;
  // Reads from the given file. Returns false in case of error.
  // If swap is true, assumes a big/little-endian swap is needed.
  bool DeSerialize(bool swap, FILE* fp);

  // Accessors.
  int NumShapes() const {
    return shape_table_.size();
  }
  const UNICHARSET& unicharset() const {
    return *unicharset_;
  }
  // Returns the number of fonts used in this ShapeTable, computing it if
  // necessary.
  int NumFonts() const;
  // Shapetable takes a pointer to the UNICHARSET, so it must persist for the
  // entire life of the ShapeTable.
  void set_unicharset(const UNICHARSET& unicharset) {
    unicharset_ = &unicharset;
  }
  // Re-indexes the class_ids in the shapetable according to the given map.
  // Useful in conjunction with set_unicharset.
  void ReMapClassIds(const GenericVector<int>& unicharset_map);
  // Returns a string listing the classes/fonts in a shape.
  STRING DebugStr(int shape_id) const;
  // Returns a debug string summarizing the table.
  STRING SummaryStr() const;

  // Adds a new shape starting with the given unichar_id and font_id.
  // Returns the assigned index.
  int AddShape(int unichar_id, int font_id);
  // Adds a copy of the given shape unless it is already present.
  // Returns the assigned index or index of existing shape if already present.
  int AddShape(const Shape& other);
  // Removes the shape given by the shape index. All indices above are changed!
  void DeleteShape(int shape_id);
  // Adds a font_id to the given existing shape index for the given
  // unichar_id. If the unichar_id is not in the shape, it is added.
  void AddToShape(int shape_id, int unichar_id, int font_id);
  // Adds the given shape to the existing shape with the given index.
  void AddShapeToShape(int shape_id, const Shape& other);
  // Returns the id of the shape that contains the given unichar and font.
  // If not found, returns -1.
  // If font_id < 0, the font_id is ignored and the first shape that matches
  // the unichar_id is returned.
  int FindShape(int unichar_id, int font_id) const;
  // Returns the first unichar_id and font_id in the given shape.
  void GetFirstUnicharAndFont(int shape_id,
                              int* unichar_id, int* font_id) const;

  // Accessors for the Shape with the given shape_id.
  const Shape& GetShape(int shape_id) const {
    return *shape_table_[shape_id];
  }
  Shape* MutableShape(int shape_id) {
    return shape_table_[shape_id];
  }

  // Expands all the classes/fonts in the shape individually to build
  // a ShapeTable.
  int BuildFromShape(const Shape& shape, const ShapeTable& master_shapes);

  // Returns true if the shapes are already merged.
  bool AlreadyMerged(int shape_id1, int shape_id2) const;
  // Returns true if any shape contains multiple unichars.
  bool AnyMultipleUnichars() const;
  // Returns the maximum number of unichars over all shapes.
  int MaxNumUnichars() const;
  // Merges shapes with a common unichar over the [start, end) interval.
  // Assumes single unichar per shape.
  void ForceFontMerges(int start, int end);
  // Returns the number of unichars in the master shape.
  int MasterUnicharCount(int shape_id) const;
  // Returns the sum of the font counts in the master shape.
  int MasterFontCount(int shape_id) const;
  // Returns the number of unichars that would result from merging the shapes.
  int MergedUnicharCount(int shape_id1, int shape_id2) const;
  // Merges two shape_ids, leaving shape_id2 marked as merged.
  void MergeShapes(int shape_id1, int shape_id2);
  // Swaps two shape_ids.
  void SwapShapes(int shape_id1, int shape_id2);
  // Appends the master shapes from other to this.
  // Used to create a clean ShapeTable from a merged one, or to create a
  // copy of a ShapeTable.
  // If not NULL, shape_map is set to map other shape_ids to this's shape_ids.
  void AppendMasterShapes(const ShapeTable& other,
                          GenericVector<int>* shape_map);
  // Returns the number of master shapes remaining after merging.
  int NumMasterShapes() const;
  // Returns the destination of this shape, (if merged), taking into account
  // the fact that the destination may itself have been merged.
  // For a non-merged shape, returns the input shape_id.
  int MasterDestinationIndex(int shape_id) const;

  // Returns false if the unichars in neither shape is a subset of the other..
  bool SubsetUnichar(int shape_id1, int shape_id2) const;
  // Returns false if the unichars in neither shape is a subset of the other..
  bool MergeSubsetUnichar(int merge_id1, int merge_id2, int shape_id) const;
  // Returns true if the unichar sets are equal between the shapes.
  bool EqualUnichars(int shape_id1, int shape_id2) const;
  bool MergeEqualUnichars(int merge_id1, int merge_id2, int shape_id) const;
  // Returns true if there is a common unichar between the shapes.
  bool CommonUnichars(int shape_id1, int shape_id2) const;
  // Returns true if there is a common font id between the shapes.
  bool CommonFont(int shape_id1, int shape_id2) const;

  // Adds the unichars of the given shape_id to the vector of results. Any
  // unichar_id that is already present just has the fonts added to the
  // font set for that result without adding a new entry in the vector.
  // NOTE: it is assumed that the results are given to this function in order
  // of decreasing rating.
  // The unichar_map vector indicates the index of the results entry containing
  // each unichar, or -1 if the unichar is not yet included in results.
  void AddShapeToResults(const ShapeRating& shape_rating,
                         GenericVector<int>* unichar_map,
                         GenericVector<UnicharRating>* results) const;

 private:
  // Adds the given unichar_id to the results if needed, updating unichar_map
  // and returning the index of unichar in results.
  int AddUnicharToResults(int unichar_id, float rating,
                          GenericVector<int>* unichar_map,
                          GenericVector<UnicharRating>* results) const;

  // Pointer to a provided unicharset used only by the Debugstr member.
  const UNICHARSET* unicharset_;
  // Vector of pointers to the Shapes in this ShapeTable.
  PointerVector<Shape> shape_table_;

  // Cached data calculated on demand.
  mutable int num_fonts_;
};

}  // namespace tesseract.

#endif  // TESSERACT_CLASSIFY_SHAPETABLE_H_