1
//========================================================================
5
// Copyright 1997-2003 Glyph & Cog, LLC
7
//========================================================================
9
#ifndef TEXTOUTPUTDEV_H
10
#define TEXTOUTPUTDEV_H
14
#ifdef USE_GCC_PRAGMAS
21
#include "OutputDev.h"
39
//------------------------------------------------------------------------
41
// Signature of the text output callback: it receives the opaque <stream>
// handle, a character buffer <text>, and the buffer length <len>.
typedef void (*TextOutputFunc)(void *stream, char *text, int len);
43
//------------------------------------------------------------------------
45
//------------------------------------------------------------------------
50
TextFontInfo(GfxState *state);
53
GBool matches(GfxState *state);
56
// Get the font name (which may be NULL).
57
GString *getFontName() {
  // The stored pointer is handed back as-is; no copy is made here.
  return fontName;
}
59
// Get font descriptor flags.
60
GBool isFixedWidth() {
  // Mask the descriptor flags with fontFixedWidth.
  return flags & fontFixedWidth;
}
61
GBool isSerif() {
  // Mask the descriptor flags with fontSerif.
  return flags & fontSerif;
}
62
GBool isSymbolic() {
  // Mask the descriptor flags with fontSymbolic.
  return flags & fontSymbolic;
}
63
GBool isItalic() {
  // Mask the descriptor flags with fontItalic.
  return flags & fontItalic;
}
64
GBool isBold() {
  // Mask the descriptor flags with fontBold.
  return flags & fontBold;
}
75
friend class TextWord;
76
friend class TextPage;
79
//------------------------------------------------------------------------
81
//------------------------------------------------------------------------
87
TextWord(GfxState *state, int rotA, double x0, double y0,
88
int charPosA, TextFontInfo *fontA, double fontSize);
93
// Add a character to the word.
94
void addChar(GfxState *state, double x, double y,
95
double dx, double dy, Unicode u);
97
// Merge <word> onto the end of <this>.
98
void merge(TextWord *word);
100
// Compares <this> to <word>, returning -1 (<), 0 (=), or +1 (>),
101
// based on a primary-axis comparison, e.g., x ordering if rot=0.
102
int primaryCmp(TextWord *word);
104
// Return the distance along the primary axis between <this> and
// <word>.
106
double primaryDelta(TextWord *word);
108
static int cmpYX(const void *p1, const void *p2);
110
// Get the TextFontInfo object associated with this word.
111
TextFontInfo *getFontInfo() {
  // Same pointer that is kept in the 'font' member.
  return font;
}
113
// Get the next TextWord on the linked list.
114
TextWord *getNext() {
  // 'next' links to the following word in the line.
  return next;
}
116
#if TEXTOUT_WORD_LIST
117
// Return the length of this word's text (the 'len' member).
int getLength() {
  return len;
}
118
// Return the Unicode value at position <idx> of the word's text.
// <idx> is used directly as an array index; it is not range-checked.
Unicode getChar(int idx) {
  return text[idx];
}
120
// Return the font name stored in this word's font info object.
// 'font' is dereferenced without a NULL check.
GString *getFontName() {
  return font->fontName;
}
121
// Copy the word color (colorR, colorG, colorB) into *r, *g, and *b.
// All three output pointers are written unconditionally.
void getColor(double *r, double *g, double *b)
122
{ *r = colorR; *g = colorG; *b = colorB; }
123
// Store this word's bounding box (xMin, yMin, xMax, yMax) through the
// four output pointers; every pointer is written unconditionally.
void getBBox(double *xMinA, double *yMinA, double *xMaxA, double *yMaxA)
124
{ *xMinA = xMin; *yMinA = yMin; *xMaxA = xMax; *yMaxA = yMax; }
125
void getCharBBox(int charIdx, double *xMinA, double *yMinA,
126
double *xMaxA, double *yMaxA);
127
// Return the font size recorded for this word.
double getFontSize() {
  return fontSize;
}
128
// Return the word's rotation (the 'rot' member).
int getRotation() {
  return rot;
}
129
// Return the word's character position within the content stream.
int getCharPos() {
  return charPos;
}
130
// Return the number of content stream characters (the 'charLen' member).
int getCharLen() {
  return charLen;
}
131
// Return the flag that marks a space between this word and the next
// word on the line.
GBool getSpaceAfter() {
  return spaceAfter;
}
134
// Return the 'underlined' flag of this word.
GBool isUnderlined() {
  return underlined;
}
135
// Return the 'link' pointer stored on this word.
Link *getLink() {
  return link;
}
139
int rot; // rotation, multiple of 90 degrees
141
double xMin, xMax; // bounding box x coordinates
142
double yMin, yMax; // bounding box y coordinates
143
double base; // baseline x or y coordinate
144
Unicode *text; // the text
145
double *edge; // "near" edge x or y coord of each char
146
// (plus one extra entry for the last char)
147
int len; // length of text and edge arrays
148
int size; // size of text and edge arrays
149
int charPos; // character position (within content stream)
150
int charLen; // number of content stream characters in this word
152
TextFontInfo *font; // font information
153
double fontSize; // font size
154
GBool spaceAfter; // set if there is a space between this
155
// word and the next word on the line
156
TextWord *next; // next word in line
158
#if TEXTOUT_WORD_LIST
159
double colorR, // word color
167
friend class TextPool;
168
friend class TextLine;
169
friend class TextBlock;
170
friend class TextFlow;
171
friend class TextWordList;
172
friend class TextPage;
175
//------------------------------------------------------------------------
177
//------------------------------------------------------------------------
185
// Return the word list for baseline bucket <baseIdx>.  The pool array
// is indexed relative to minBaseIdx; <baseIdx> is not range-checked.
TextWord *getPool(int baseIdx) {
  return pool[baseIdx - minBaseIdx];
}
186
// Make <p> the word list for baseline bucket <baseIdx>.  The pool array
// is indexed relative to minBaseIdx; <baseIdx> is not range-checked.
void setPool(int baseIdx, TextWord *p) {
  pool[baseIdx - minBaseIdx] = p;
}
188
int getBaseIdx(double base);
190
void addWord(TextWord *word);
194
int minBaseIdx; // min baseline bucket index
195
int maxBaseIdx; // max baseline bucket index
196
TextWord **pool; // array of linked lists, one for each
197
// baseline value (multiple of 4 pts)
198
TextWord *cursor; // pointer to last-accessed word
199
int cursorBaseIdx; // baseline bucket index of last-accessed word
201
friend class TextBlock;
202
friend class TextPage;
205
//------------------------------------------------------------------------
207
//------------------------------------------------------------------------
212
TextLine(TextBlock *blkA, int rotA, double baseA);
215
void addWord(TextWord *word);
217
// Return the distance along the primary axis between <this> and
// <line>.
219
double primaryDelta(TextLine *line);
221
// Compares <this> to <line>, returning -1 (<), 0 (=), or +1 (>),
222
// based on a primary-axis comparison, e.g., x ordering if rot=0.
223
int primaryCmp(TextLine *line);
225
// Compares <this> to <line>, returning -1 (<), 0 (=), or +1 (>),
226
// based on a secondary-axis comparison of the baselines, e.g., y
227
// ordering if rot=0.
228
int secondaryCmp(TextLine *line);
230
int cmpYX(TextLine *line);
232
static int cmpXY(const void *p1, const void *p2);
234
void coalesce(UnicodeMap *uMap);
236
// Get the head of the linked list of TextWords.
237
TextWord *getWords() {
  // 'words' holds the words in this line.
  return words;
}
239
// Get the next TextLine on the linked list.
240
TextLine *getNext() {
  // 'next' links to the following line in the block.
  return next;
}
242
// Returns true if the last char of the line is a hyphen.
243
GBool isHyphenated() {
  // Reports the stored 'hyphenated' flag; nothing is recomputed here.
  return hyphenated;
}
247
TextBlock *blk; // parent block
248
int rot; // text rotation
249
double xMin, xMax; // bounding box x coordinates
250
double yMin, yMax; // bounding box y coordinates
251
double base; // baseline x or y coordinate
252
TextWord *words; // words in this line
253
TextWord *lastWord; // last word in this line
254
Unicode *text; // Unicode text of the line, including
255
// spaces between words
256
double *edge; // "near" edge x or y coord of each char
257
// (plus one extra entry for the last char)
258
int *col; // starting column number of each Unicode char
259
int len; // number of Unicode chars
260
int convertedLen; // total number of converted characters
261
GBool hyphenated; // set if last char is a hyphen
262
TextLine *next; // next line in block
264
friend class TextLineFrag;
265
friend class TextBlock;
266
friend class TextFlow;
267
friend class TextWordList;
268
friend class TextPage;
271
//------------------------------------------------------------------------
273
//------------------------------------------------------------------------
278
TextBlock(TextPage *pageA, int rotA);
281
void addWord(TextWord *word);
283
void coalesce(UnicodeMap *uMap);
285
// Update this block's priMin and priMax values, looking at <blk>.
286
void updatePriMinMax(TextBlock *blk);
288
static int cmpXYPrimaryRot(const void *p1, const void *p2);
290
static int cmpYXPrimaryRot(const void *p1, const void *p2);
292
int primaryCmp(TextBlock *blk);
294
double secondaryDelta(TextBlock *blk);
296
// Returns true if <this> is below <blk>, relative to the page's
// primary rotation.
298
GBool isBelow(TextBlock *blk);
300
// Get the head of the linked list of TextLines.
301
TextLine *getLines() {
  // 'lines' is the head of this block's linked list of lines.
  return lines;
}
303
// Get the next TextBlock on the linked list.
304
TextBlock *getNext() {
  // Hands back the 'next' link unchanged.
  return next;
}
308
TextPage *page; // the parent page
309
int rot; // text rotation
310
double xMin, xMax; // bounding box x coordinates
311
double yMin, yMax; // bounding box y coordinates
312
double priMin, priMax; // whitespace bounding box along primary axis
314
TextPool *pool; // pool of words (used only until lines are built)
316
TextLine *lines; // linked list of lines
317
TextLine *curLine; // most recently added line
318
int nLines; // number of lines
319
int charCount; // number of characters in the block
320
int col; // starting column
321
int nColumns; // number of columns in the block
324
TextBlock *stackNext;
326
friend class TextLine;
327
friend class TextLineFrag;
328
friend class TextFlow;
329
friend class TextWordList;
330
friend class TextPage;
333
//------------------------------------------------------------------------
335
//------------------------------------------------------------------------
340
TextFlow(TextPage *pageA, TextBlock *blk);
343
// Add a block to the end of this flow.
344
void addBlock(TextBlock *blk);
346
// Returns true if <blk> fits below <prevBlk> in the flow, i.e., (1)
347
// it uses a font no larger than the last block added to the flow,
348
// and (2) it fits within the flow's [priMin, priMax] along the
// primary axis.
350
GBool blockFits(TextBlock *blk, TextBlock *prevBlk);
352
// Get the head of the linked list of TextBlocks.
353
TextBlock *getBlocks() {
  // 'blocks' holds the blocks in this flow.
  return blocks;
}
355
// Get the next TextFlow on the linked list.
356
TextFlow *getNext() {
  // Hands back the 'next' link unchanged.
  return next;
}
360
TextPage *page; // the parent page
361
double xMin, xMax; // bounding box x coordinates
362
double yMin, yMax; // bounding box y coordinates
363
double priMin, priMax; // whitespace bounding box along primary axis
364
TextBlock *blocks; // blocks in flow
365
TextBlock *lastBlk; // last block in this flow
368
friend class TextWordList;
369
friend class TextPage;
372
#if TEXTOUT_WORD_LIST
374
//------------------------------------------------------------------------
376
//------------------------------------------------------------------------
381
// Build a flat word list, in content stream order (if
382
// text->rawOrder is true), physical layout order (if <physLayout>
383
// is true and text->rawOrder is false), or reading order (if both
// flags are false).
385
TextWordList(TextPage *text, GBool physLayout);
389
// Return the number of words on the list.
392
// Return the <idx>th word from the list.
393
TextWord *get(int idx);
397
GList *words; // [TextWord]
400
#endif // TEXTOUT_WORD_LIST
402
//------------------------------------------------------------------------
404
//------------------------------------------------------------------------
410
TextPage(GBool rawOrderA);
416
void startPage(GfxState *state);
418
// End the current page.
421
// Update the current font.
422
void updateFont(GfxState *state);
425
void beginWord(GfxState *state, double x0, double y0);
427
// Add a character to the current word.
428
void addChar(GfxState *state, double x, double y,
429
double dx, double dy,
430
CharCode c, int nBytes, Unicode *u, int uLen);
432
// End the current word, sorting it into the list of words.
435
// Add a word, sorting it into the list of words.
436
void addWord(TextWord *word);
438
// Add a (potential) underline.
439
void addUnderline(double x0, double y0, double x1, double y1);
442
void addLink(int xMin, int yMin, int xMax, int yMax, Link *link);
444
// Coalesce strings that look like parts of the same line.
445
void coalesce(GBool physLayout, GBool doHTML);
447
// Find a string. If <startAtTop> is true, starts looking at the
448
// top of the page; else if <startAtLast> is true, starts looking
449
// immediately after the last find result; else starts looking at
450
// <xMin>,<yMin>. If <stopAtBottom> is true, stops looking at the
451
// bottom of the page; else if <stopAtLast> is true, stops looking
452
// just before the last find result; else stops looking at
// <xMax>,<yMax>.
454
GBool findText(Unicode *s, int len,
455
GBool startAtTop, GBool stopAtBottom,
456
GBool startAtLast, GBool stopAtLast,
457
GBool caseSensitive, GBool backward,
458
double *xMin, double *yMin,
459
double *xMax, double *yMax);
461
// Get the text which is inside the specified rectangle.
462
GString *getText(double xMin, double yMin,
463
double xMax, double yMax);
465
// Find a string by character position and length. If found, sets
466
// the text bounding rectangle and returns true; otherwise returns
// false.
468
GBool findCharRange(int pos, int length,
469
double *xMin, double *yMin,
470
double *xMax, double *yMax);
472
// Dump contents of page to a file.
473
void dump(void *outputStream, TextOutputFunc outputFunc,
476
// Get the head of the linked list of TextFlows.
477
TextFlow *getFlows() {
  // 'flows' is the page's linked list of flows.
  return flows;
}
479
#if TEXTOUT_WORD_LIST
480
// Build a flat word list, in content stream order (if
481
// this->rawOrder is true), physical layout order (if <physLayout>
482
// is true and this->rawOrder is false), or reading order (if both
// flags are false).
484
TextWordList *makeWordList(GBool physLayout);
490
void assignColumns(TextLineFrag *frags, int nFrags, int rot);
491
int dumpFragment(Unicode *text, int len, UnicodeMap *uMap, GString *s);
493
GBool rawOrder; // keep text in content stream order
495
double pageWidth, pageHeight; // width and height of current page
496
TextWord *curWord; // currently active string
497
int charPos; // next character position (within content stream)
499
TextFontInfo *curFont; // current font
500
double curFontSize; // current font size
501
int nest; // current nesting level (for Type 3 fonts)
502
int nTinyChars; // number of "tiny" chars seen so far
503
GBool lastCharOverlap; // set if the last added char overlapped the previous char
506
TextPool *pools[4]; // a "pool" of TextWords for each rotation
507
TextFlow *flows; // linked list of flows
508
TextBlock **blocks; // array of blocks, in yx order
509
int nBlocks; // number of blocks
510
int primaryRot; // primary rotation
511
GBool primaryLR; // primary direction (true means L-to-R,
512
// false means R-to-L)
513
TextWord *rawWords; // list of words, in raw order (only if rawOrder is set)
515
TextWord *rawLastWord; // last word on rawWords list
517
GList *fonts; // all font info objects used on this
518
// page [TextFontInfo]
520
double lastFindXMin, // coordinates of the last "find" result
524
GList *underlines; // [TextUnderline]
525
GList *links; // [TextLink]
527
friend class TextLine;
528
friend class TextLineFrag;
529
friend class TextBlock;
530
friend class TextFlow;
531
friend class TextWordList;
534
//------------------------------------------------------------------------
536
//------------------------------------------------------------------------
538
class TextOutputDev: public OutputDev {
541
// Open a text output file. If <fileName> is NULL, no file is
542
// written (this is useful, e.g., for searching text). If
543
// <physLayoutA> is true, the original physical layout of the text
544
// is maintained. If <rawOrderA> is true, the text is kept in
545
// content stream order.
546
TextOutputDev(char *fileName, GBool physLayoutA,
547
GBool rawOrderA, GBool append);
549
// Create a TextOutputDev which will write to a generic stream. If
550
// <physLayoutA> is true, the original physical layout of the text
551
// is maintained. If <rawOrderA> is true, the text is kept in
552
// content stream order.
553
TextOutputDev(TextOutputFunc func, void *stream,
554
GBool physLayoutA, GBool rawOrderA);
557
virtual ~TextOutputDev();
559
// Check if file was successfully created.
560
virtual GBool isOk() {
  // Reports the 'ok' member unchanged.
  return ok;
}
562
//---- get info about output device
564
// Does this device use upside-down coordinates?
565
// (Upside-down means (0,0) is the top left corner of the page.)
566
virtual GBool upsideDown() {
  // This device always answers gTrue.
  return gTrue;
}
568
// Does this device use drawChar() or drawString()?
569
virtual GBool useDrawChar() {
  // This device always answers gTrue.
  return gTrue;
}
571
// Does this device use beginType3Char/endType3Char? Otherwise,
572
// text in Type 3 fonts will be drawn with drawChar/drawString.
573
virtual GBool interpretType3Chars() {
  // This device always answers gFalse.
  return gFalse;
}
575
// Does this device need non-text content?
576
virtual GBool needNonText() {
  // This device always answers gFalse.
  return gFalse;
}
578
//----- initialization and control
581
virtual void startPage(int pageNum, GfxState *state);
584
virtual void endPage();
586
//----- update text state
587
virtual void updateFont(GfxState *state);
590
virtual void beginString(GfxState *state, GString *s);
591
virtual void endString(GfxState *state);
592
virtual void drawChar(GfxState *state, double x, double y,
593
double dx, double dy,
594
double originX, double originY,
595
CharCode c, int nBytes, Unicode *u, int uLen);
597
//----- path painting
598
virtual void stroke(GfxState *state);
599
virtual void fill(GfxState *state);
600
virtual void eoFill(GfxState *state);
603
virtual void processLink(Link *link, Catalog *catalog);
605
//----- special access
607
// Find a string. If <startAtTop> is true, starts looking at the
608
// top of the page; else if <startAtLast> is true, starts looking
609
// immediately after the last find result; else starts looking at
610
// <xMin>,<yMin>. If <stopAtBottom> is true, stops looking at the
611
// bottom of the page; else if <stopAtLast> is true, stops looking
612
// just before the last find result; else stops looking at
// <xMax>,<yMax>.
614
GBool findText(Unicode *s, int len,
615
GBool startAtTop, GBool stopAtBottom,
616
GBool startAtLast, GBool stopAtLast,
617
GBool caseSensitive, GBool backward,
618
double *xMin, double *yMin,
619
double *xMax, double *yMax);
621
// Get the text which is inside the specified rectangle.
622
GString *getText(double xMin, double yMin,
623
double xMax, double yMax);
625
// Find a string by character position and length. If found, sets
626
// the text bounding rectangle and returns true; otherwise returns
// false.
628
GBool findCharRange(int pos, int length,
629
double *xMin, double *yMin,
630
double *xMax, double *yMax);
632
#if TEXTOUT_WORD_LIST
633
// Build a flat word list, in content stream order (if
634
// this->rawOrder is true), physical layout order (if
635
// this->physLayout is true and this->rawOrder is false), or reading
636
// order (if both flags are false).
637
TextWordList *makeWordList();
640
// Returns the TextPage object for the last rasterized page,
641
// transferring ownership to the caller.
642
TextPage *takeText();
644
// Turn extra processing for HTML conversion on or off.
645
void enableHTMLExtras(GBool doHTMLA) {
  // Only records the flag in doHTML; nothing else happens here.
  doHTML = doHTMLA;
}
649
TextOutputFunc outputFunc; // output function
650
void *outputStream; // output stream
651
GBool needClose; // need to close the output file?
652
// (only if outputStream is a FILE*)
653
TextPage *text; // text for the current page
654
GBool physLayout; // maintain original physical layout when dumping text
656
GBool rawOrder; // keep text in content stream order
657
GBool doHTML; // extra processing for HTML conversion
658
GBool ok; // set up ok?