1
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
2
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
3
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
5
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
6
<meta name="generator" content="AsciiDoc 8.5.2" />
7
<title>UNICHARSET_EXTRACTOR(1)</title>
8
<style type="text/css">
10
p, li, dt, dd, div, pre, h1, h2, h3, h4, h5, h6 {
12
border: 1px solid red;
17
margin: 1em 5% 1em 5%;
22
text-decoration: underline;
42
h1, h2, h3, h4, h5, h6 {
44
font-family: sans-serif;
51
border-bottom: 2px solid silver;
69
border: 1px solid silver;
88
font-family: sans-serif;
94
span#revnumber, span#revdate, span#revremark {
95
font-family: sans-serif;
99
font-family: sans-serif;
101
border-top: 2px solid silver;
107
padding-bottom: 0.5em;
111
padding-bottom: 0.5em;
116
margin-bottom: 1.5em;
118
div.tableblock, div.imageblock, div.exampleblock, div.verseblock,
119
div.quoteblock, div.literalblock, div.listingblock, div.sidebarblock,
120
div.admonitionblock {
122
margin-bottom: 1.5em;
124
div.admonitionblock {
126
margin-bottom: 2.0em;
131
div.content { /* Block element content. */
135
/* Block element titles. */
136
div.title, caption.title {
138
font-family: sans-serif;
142
margin-bottom: 0.5em;
148
td div.title:first-child {
151
div.content div.title:first-child {
154
div.content + div.title {
158
div.sidebarblock > div.content {
160
border: 1px solid silver;
164
div.listingblock > div.content {
165
border: 1px solid silver;
170
div.quoteblock, div.verseblock {
174
border-left: 5px solid #dddddd;
178
div.quoteblock > div.attribution {
183
div.verseblock > div.content {
186
div.verseblock > div.attribution {
190
/* DEPRECATED: Pre version 8.2.7 verse style literal block. */
191
div.verseblock + div.attribution {
195
div.admonitionblock .icon {
199
text-decoration: underline;
201
padding-right: 0.5em;
203
div.admonitionblock td.content {
205
border-left: 3px solid #dddddd;
208
div.exampleblock > div.content {
209
border-left: 3px solid #dddddd;
213
/* Content inside an image block gets no left padding. */
div.imageblock div.content {
  padding-left: 0;
}
214
/* Inline images are drawn without a border. */
span.image img {
  border-style: none;
}
215
/* Visited image links are coloured white.
   NOTE(review): presumably to hide the visited-link colour around linked
   images on a white page background; confirm. */
a.image:visited {
  color: white;
}
219
margin-bottom: 0.8em;
232
list-style-position: outside;
235
list-style-type: decimal;
238
list-style-type: lower-alpha;
241
list-style-type: upper-alpha;
244
list-style-type: lower-roman;
247
list-style-type: upper-roman;
250
div.compact ul, div.compact ol,
251
div.compact p, div.compact p,
252
div.compact div, div.compact div {
254
margin-bottom: 0.1em;
257
div.tableblock > table {
258
border: 3px solid #527bbd;
260
thead, p.table.header {
261
font-family: sans-serif;
273
/* Because the table frame attribute is overridden by CSS in most browsers. */
274
div.tableblock > table[frame="void"] {
277
div.tableblock > table[frame="hsides"] {
278
border-left-style: none;
279
border-right-style: none;
281
div.tableblock > table[frame="vsides"] {
282
border-top-style: none;
283
border-bottom-style: none;
289
margin-bottom: 0.8em;
292
padding-bottom: 15px;
294
dt.hdlist1.strong, td.hdlist1.strong {
300
padding-right: 0.8em;
306
div.hdlist.compact tr {
315
.footnote, .footnoteref {
319
span.footnote, span.footnoteref {
320
vertical-align: super;
324
margin: 20px 0 20px 0;
328
#footnotes div.footnote {
334
border-top: 1px solid silver;
344
/* The footer badges container is not displayed. */
div#footer-badges {
  display: none;
}
348
margin-bottom: 2.5em;
353
font-family: sans-serif;
357
margin-bottom: 0.1em;
360
div.toclevel1, div.toclevel2, div.toclevel3, div.toclevel4 {
376
/* Overrides for manpage documents */
379
padding-bottom: 0.5em;
380
border-top: 2px solid silver;
381
border-bottom: 2px solid silver;
391
/* The table-of-contents container is not displayed. */
div#toc {
  display: none;
}
394
/* Workarounds for IE6's broken and incomplete CSS2. */
396
div.sidebar-content {
398
border: 1px solid silver;
401
div.sidebar-title, div.image-title {
403
font-family: sans-serif;
406
margin-bottom: 0.5em;
409
div.listingblock div.content {
410
border: 1px solid silver;
415
div.quoteblock-attribution {
420
div.verseblock-content {
423
div.verseblock-attribution {
428
div.exampleblock-content {
429
border-left: 3px solid #dddddd;
433
/* IE6 sets dynamically generated links as visited. */
434
/* Links inside the table of contents stay blue even when marked visited. */
div#toc a:visited {
  color: blue;
}
436
<script type="text/javascript">
438
// Once the page has finished loading, run the footnotes generator.
window.onload = function () {
  asciidoc.footnotes();
};
439
var asciidoc = { // Namespace.
441
/////////////////////////////////////////////////////////////////////
442
// Table Of Contents generator
443
/////////////////////////////////////////////////////////////////////
445
/* Author: Mihai Bazon, September 2002
446
* http://students.infoiasi.ro/~mishoo
448
* Table Of Content generator
451
* Feel free to use this script under the terms of the GNU General Public
452
* License, as long as you do not remove or alter this notice.
455
/* modified by Troy D. Hanson, September 2006. License: GPL */
456
/* modified by Stuart Rackham, 2006, 2009. License: GPL */
459
toc: function (toclevels) {
461
function getText(el) {
463
for (var i = el.firstChild; i != null; i = i.nextSibling) {
464
if (i.nodeType == 3 /* Node.TEXT_NODE */) // IE doesn't speak constants.
466
else if (i.firstChild != null)
472
function TocEntry(el, text, toclevel) {
475
this.toclevel = toclevel;
478
function tocEntries(el, toclevels) {
479
var result = new Array;
480
var re = new RegExp('[hH]([2-'+(toclevels+1)+'])');
481
// Function that scans the DOM tree for header elements (the DOM2
482
// nodeIterator API would be a better technique but not supported by all
484
var iterate = function (el) {
485
for (var i = el.firstChild; i != null; i = i.nextSibling) {
486
if (i.nodeType == 1 /* Node.ELEMENT_NODE */) {
487
var mo = re.exec(i.tagName);
488
if (mo && (i.getAttribute("class") || i.getAttribute("className")) != "float") {
489
result[result.length] = new TocEntry(i, getText(i), mo[1]-1);
499
var toc = document.getElementById("toc");
500
var entries = tocEntries(document.getElementById("content"), toclevels);
501
for (var i = 0; i < entries.length; ++i) {
502
var entry = entries[i];
503
if (entry.element.id == "")
504
entry.element.id = "_toc_" + i;
505
var a = document.createElement("a");
506
a.href = "#" + entry.element.id;
507
a.appendChild(document.createTextNode(entry.text));
508
var div = document.createElement("div");
510
div.className = "toclevel" + entry.toclevel;
511
toc.appendChild(div);
513
if (entries.length == 0)
514
toc.parentNode.removeChild(toc);
518
/////////////////////////////////////////////////////////////////////
519
// Footnotes generator
520
/////////////////////////////////////////////////////////////////////
522
/* Based on footnote generation code from:
523
* http://www.brandspankingnew.net/archive/2005/07/format_footnote.html
526
footnotes: function () {
527
var cont = document.getElementById("content");
528
var noteholder = document.getElementById("footnotes");
529
var spans = cont.getElementsByTagName("span");
532
for (i=0; i<spans.length; i++) {
533
if (spans[i].className == "footnote") {
535
// Use [\s\S] in place of . so multi-line matches work.
536
// Because JavaScript has no s (dotall) regex flag.
537
note = spans[i].innerHTML.match(/\s*\[([\s\S]*)]\s*/)[1];
538
noteholder.innerHTML +=
539
"<div class='footnote' id='_footnote_" + n + "'>" +
540
"<a href='#_footnoteref_" + n + "' title='Return to text'>" +
541
n + "</a>. " + note + "</div>";
543
"[<a id='_footnoteref_" + n + "' href='#_footnote_" + n +
544
"' title='View footnote' class='footnote'>" + n + "</a>]";
545
var id =spans[i].getAttribute("id");
546
if (id != null) refs["#"+id] = n;
550
noteholder.parentNode.removeChild(noteholder);
552
// Process footnoterefs.
553
for (i=0; i<spans.length; i++) {
554
if (spans[i].className == "footnoteref") {
555
var href = spans[i].getElementsByTagName("a")[0].getAttribute("href");
556
href = href.match(/#.*/)[0]; // Because IE return full URL.
559
"[<a href='#_footnote_" + n +
560
"' title='View footnote' class='footnote'>" + n + "</a>]";
573
UNICHARSET_EXTRACTOR(1) Manual Page
576
<div class="sectionbody">
577
<p>unicharset_extractor -
578
extract unicharset from Tesseract boxfiles
583
<h2 id="_synopsis">SYNOPSIS</h2>
584
<div class="sectionbody">
585
<div class="paragraph"><p><strong>unicharset_extractor</strong> <em>[-D dir]</em> <em>FILE</em>…</p></div>
587
<h2 id="_description">DESCRIPTION</h2>
588
<div class="sectionbody">
589
<div class="paragraph"><p>Tesseract needs to know the set of possible characters it can output.
590
To generate the unicharset data file, use the unicharset_extractor
591
program on the same training pages bounding box files as used for
592
clustering:</p></div>
593
<div class="literalblock">
594
<div class="content">
595
<pre><tt>unicharset_extractor fontfile_1.box fontfile_2.box ...</tt></pre>
597
<div class="paragraph"><p>The unicharset will be put into the file <em>dir/unicharset</em>, or simply
598
<em>./unicharset</em> if no output directory is provided.</p></div>
599
<div class="paragraph"><p>Tesseract also needs to have access to character properties isalpha,
600
isdigit, isupper, islower, ispunctuation. All of this auxiliary data
601
and more is encoded in this file. (See unicharset(5))</p></div>
602
<div class="paragraph"><p>If your system supports the wctype functions, these values will be set
603
automatically by unicharset_extractor and there is no need to edit the
604
unicharset file. On some older systems (e.g. Windows 95), the unicharset
605
file must be edited by hand to add these property description codes.</p></div>
606
<div class="paragraph"><p><strong>NOTE</strong> The unicharset file must be regenerated whenever inttemp, normproto
607
and pffmtable are generated (i.e. they must all be recreated when the box
608
file is changed) as they have to be in sync. This is made easier than in
609
previous versions by running unicharset_extractor before mftraining and
610
cntraining, and giving the unicharset to mftraining.</p></div>
612
<h2 id="_see_also">SEE ALSO</h2>
613
<div class="sectionbody">
614
<div class="paragraph"><p>tesseract(1), unicharset(5)</p></div>
615
<div class="paragraph"><p><a href="http://code.google.com/p/tesseract-ocr/wiki/TrainingTesseract3">http://code.google.com/p/tesseract-ocr/wiki/TrainingTesseract3</a></p></div>
617
<h2 id="_history">HISTORY</h2>
618
<div class="sectionbody">
619
<div class="paragraph"><p>unicharset_extractor first appeared in Tesseract 2.00.</p></div>
621
<h2 id="_copying">COPYING</h2>
622
<div class="sectionbody">
623
<div class="paragraph"><p>Copyright (C) 2006, Google Inc.
624
Licensed under the Apache License, Version 2.0</p></div>
626
<h2 id="_author">AUTHOR</h2>
627
<div class="sectionbody">
628
<div class="paragraph"><p>The Tesseract OCR engine was written by Ray Smith and his research groups
629
at Hewlett Packard (1985-1995) and Google (2006-present).</p></div>
632
<div id="footnotes"><hr /></div>
634
<div id="footer-text">
635
Last updated 2012-02-09 09:19:05 PDT