1
//========================================================================
5
// Copyright 1997-2003 Glyph & Cog, LLC
7
//========================================================================
9
//========================================================================
11
// Modified under the Poppler project - http://poppler.freedesktop.org
13
// Copyright (C) 2005-2007 Kristian Høgsberg <krh@redhat.com>
14
// Copyright (C) 2006 Ed Catmur <ed@catmur.co.uk>
15
// Copyright (C) 2007-2008 Carlos Garcia Campos <carlosgc@gnome.org>
16
// Copyright (C) 2007 Adrian Johnson <ajohnson@redneon.com>
17
// Copyright (C) 2008, 2010 Albert Astals Cid <aacid@kde.org>
18
// Copyright (C) 2010 Brian Ewins <brian.ewins@gmail.com>
20
// To see a description of the changes please see the Changelog file that
21
// came with your tarball or type make ChangeLog if you are building from git
23
//========================================================================
25
#ifndef TEXTOUTPUTDEV_H
26
#define TEXTOUTPUTDEV_H
28
#ifdef USE_GCC_PRAGMAS
32
#include "poppler-config.h"
34
#include "goo/gtypes.h"
37
#include "OutputDev.h"
55
class TextSelectionVisitor;
57
//------------------------------------------------------------------------
59
// Callback type used to emit extracted text.  It is handed an opaque
// <stream> pointer together with a <text> buffer and its length <len>.
// NOTE(review): <len> is presumably a byte count and <text> is not
// necessarily NUL-terminated -- confirm against the callers.
typedef void (*TextOutputFunc)(void *stream, char *text, int len);
67
//------------------------------------------------------------------------
69
//------------------------------------------------------------------------
74
TextFontInfo(GfxState *state);
77
GBool matches(GfxState *state);
80
// Get the font name (which may be NULL).
81
// Accessor for the stored font name; callers must cope with NULL.
GooString *getFontName() {
  return this->fontName;
}
83
// Get font descriptor flags.
84
// True when the fontFixedWidth bit is set in the descriptor flags.
// The mask result is compared against zero so that a strict boolean is
// returned instead of the raw masked bits.
GBool isFixedWidth() { return (flags & fontFixedWidth) != 0; }
85
// True when the fontSerif bit is set in the descriptor flags.
// The mask result is compared against zero so that a strict boolean is
// returned instead of the raw masked bits.
GBool isSerif() { return (flags & fontSerif) != 0; }
86
// True when the fontSymbolic bit is set in the descriptor flags.
// The mask result is compared against zero so that a strict boolean is
// returned instead of the raw masked bits.
GBool isSymbolic() { return (flags & fontSymbolic) != 0; }
87
// True when the fontItalic bit is set in the descriptor flags.
// The mask result is compared against zero so that a strict boolean is
// returned instead of the raw masked bits.
GBool isItalic() { return (flags & fontItalic) != 0; }
88
// True when the fontBold bit is set in the descriptor flags.
// The mask result is compared against zero so that a strict boolean is
// returned instead of the raw masked bits.
GBool isBold() { return (flags & fontBold) != 0; }
99
friend class TextWord;
100
friend class TextPage;
101
friend class TextSelectionPainter;
104
//------------------------------------------------------------------------
106
//------------------------------------------------------------------------
112
TextWord(GfxState *state, int rotA, double x0, double y0,
113
int charPosA, TextFontInfo *fontA, double fontSize);
118
// Add a character to the word.
119
void addChar(GfxState *state, double x, double y,
120
double dx, double dy, CharCode c, Unicode u);
122
// Merge <word> onto the end of <this>.
123
void merge(TextWord *word);
125
// Compares <this> to <word>, returning -1 (<), 0 (=), or +1 (>),
126
// based on a primary-axis comparison, e.g., x ordering if rot=0.
127
int primaryCmp(TextWord *word);
129
// Return the distance along the primary axis between <this> and
// <word>.
131
double primaryDelta(TextWord *word);
133
static int cmpYX(const void *p1, const void *p2);
135
void visitSelection(TextSelectionVisitor *visitor,
136
PDFRectangle *selection,
137
SelectionStyle style);
139
// Get the TextFontInfo object associated with this word.
140
// Accessor for the font information pointer stored with this word.
TextFontInfo *getFontInfo() {
  return this->font;
}
142
// Get the next TextWord on the linked list.
143
// Follow the <next> link to the following word in the line.
TextWord *getNext() {
  return this->next;
}
145
#if TEXTOUT_WORD_LIST
146
int getLength() { return len; }
147
// Address of the <idx>th Unicode value of this word.  The index is not
// range-checked here.
const Unicode *getChar(int idx) {
  return text + idx;
}
148
GooString *getText();
149
// Font name taken from this word's font information object.  <font> is
// dereferenced without a NULL check.
GooString *getFontName() {
  TextFontInfo *info = font;
  return info->fontName;
}
150
void getColor(double *r, double *g, double *b)
151
{ *r = colorR; *g = colorG; *b = colorB; }
152
void getBBox(double *xMinA, double *yMinA, double *xMaxA, double *yMaxA)
153
{ *xMinA = xMin; *yMinA = yMin; *xMaxA = xMax; *yMaxA = yMax; }
154
void getCharBBox(int charIdx, double *xMinA, double *yMinA,
155
double *xMaxA, double *yMaxA);
156
double getFontSize() { return fontSize; }
157
int getRotation() { return rot; }
158
int getCharPos() { return charPos; }
159
int getCharLen() { return charLen; }
160
// Reports the <spaceAfter> flag, i.e. whether a space separates this
// word from the next word on the line.
GBool getSpaceAfter() {
  return this->spaceAfter;
}
162
// Reports the <underlined> flag of this word.
GBool isUnderlined() {
  return this->underlined;
}
163
// Accessor for the <link> pointer stored with this word.
Link *getLink() {
  return this->link;
}
164
// "Near" edge coordinate stored for character <i>.  The index is not
// range-checked here.
double getEdge(int i) {
  return *(edge + i);
}
165
double getBaseline () { return base; }
166
// Reports the <spaceAfter> flag; returns the same member as
// getSpaceAfter().
GBool hasSpaceAfter() {
  return this->spaceAfter;
}
167
// Follow the <next> link; returns the same member as getNext().
TextWord *nextWord() {
  return this->next;
}
170
int rot; // rotation, multiple of 90 degrees
172
double xMin, xMax; // bounding box x coordinates
173
double yMin, yMax; // bounding box y coordinates
174
double base; // baseline x or y coordinate
175
Unicode *text; // the text
176
CharCode *charcode; // glyph indices
177
double *edge; // "near" edge x or y coord of each char
178
// (plus one extra entry for the last char)
179
int len; // length of text and edge arrays
180
int size; // size of text and edge arrays
181
int charPos; // character position (within content stream)
182
int charLen; // number of content stream characters in
184
TextFontInfo *font; // font information
185
double fontSize; // font size
186
GBool spaceAfter; // set if there is a space between this
187
// word and the next word on the line
188
TextWord *next; // next word in line
190
#if TEXTOUT_WORD_LIST
191
double colorR, // word color
199
friend class TextPool;
200
friend class TextLine;
201
friend class TextBlock;
202
friend class TextFlow;
203
friend class TextWordList;
204
friend class TextPage;
206
friend class TextSelectionPainter;
207
friend class TextSelectionDumper;
210
//------------------------------------------------------------------------
212
//------------------------------------------------------------------------
220
// Head of the word list stored for baseline bucket <baseIdx>.  The
// bucket index is rebased by minBaseIdx and is not range-checked here.
TextWord *getPool(int baseIdx) {
  const int slot = baseIdx - minBaseIdx;
  return pool[slot];
}
221
// Store <p> as the list head for baseline bucket <baseIdx>.  The bucket
// index is rebased by minBaseIdx and is not range-checked here.
void setPool(int baseIdx, TextWord *p) {
  const int slot = baseIdx - minBaseIdx;
  pool[slot] = p;
}
223
int getBaseIdx(double base);
225
void addWord(TextWord *word);
229
int minBaseIdx; // min baseline bucket index
230
int maxBaseIdx; // max baseline bucket index
231
TextWord **pool; // array of linked lists, one for each
232
// baseline value (multiple of 4 pts)
233
TextWord *cursor; // pointer to last-accessed word
234
int cursorBaseIdx; // baseline bucket index of last-accessed word
236
friend class TextBlock;
237
friend class TextPage;
242
//------------------------------------------------------------------------
244
//------------------------------------------------------------------------
249
TextLine(TextBlock *blkA, int rotA, double baseA);
252
void addWord(TextWord *word);
254
// Return the distance along the primary axis between <this> and
// <line>.
256
double primaryDelta(TextLine *line);
258
// Compares <this> to <line>, returning -1 (<), 0 (=), or +1 (>),
259
// based on a primary-axis comparison, e.g., x ordering if rot=0.
260
int primaryCmp(TextLine *line);
262
// Compares <this> to <line>, returning -1 (<), 0 (=), or +1 (>),
263
// based on a secondary-axis comparison of the baselines, e.g., y
264
// ordering if rot=0.
265
int secondaryCmp(TextLine *line);
267
int cmpYX(TextLine *line);
269
static int cmpXY(const void *p1, const void *p2);
271
void coalesce(UnicodeMap *uMap);
273
void visitSelection(TextSelectionVisitor *visitor,
274
PDFRectangle *selection,
275
SelectionStyle style);
277
// Get the head of the linked list of TextWords.
278
// First word of this line's word list.
TextWord *getWords() {
  return this->words;
}
280
// Get the next TextLine on the linked list.
281
// Follow the <next> link to the following line in the block.
TextLine *getNext() {
  return this->next;
}
283
// Returns true if the last char of the line is a hyphen.
284
// Reports the <hyphenated> flag (set if the last char is a hyphen).
GBool isHyphenated() {
  return this->hyphenated;
}
288
TextBlock *blk; // parent block
289
int rot; // text rotation
290
double xMin, xMax; // bounding box x coordinates
291
double yMin, yMax; // bounding box y coordinates
292
double base; // baseline x or y coordinate
293
TextWord *words; // words in this line
294
TextWord *lastWord; // last word in this line
295
Unicode *text; // Unicode text of the line, including
296
// spaces between words
297
double *edge; // "near" edge x or y coord of each char
298
// (plus one extra entry for the last char)
299
int *col; // starting column number of each Unicode char
300
int len; // number of Unicode chars
301
int convertedLen; // total number of converted characters
302
GBool hyphenated; // set if last char is a hyphen
303
TextLine *next; // next line in block
304
Unicode *normalized; // normalized form of Unicode text
305
int normalized_len; // number of normalized Unicode chars
306
int *normalized_idx; // indices of normalized chars into Unicode text
308
friend class TextLineFrag;
309
friend class TextBlock;
310
friend class TextFlow;
311
friend class TextWordList;
312
friend class TextPage;
314
friend class TextSelectionPainter;
315
friend class TextSelectionSizer;
316
friend class TextSelectionDumper;
319
//------------------------------------------------------------------------
321
//------------------------------------------------------------------------
326
TextBlock(TextPage *pageA, int rotA);
329
void addWord(TextWord *word);
331
void coalesce(UnicodeMap *uMap);
333
// Update this block's priMin and priMax values, looking at <blk>.
334
void updatePriMinMax(TextBlock *blk);
336
static int cmpXYPrimaryRot(const void *p1, const void *p2);
338
static int cmpYXPrimaryRot(const void *p1, const void *p2);
340
int primaryCmp(TextBlock *blk);
342
double secondaryDelta(TextBlock *blk);
344
// Returns true if <this> is below <blk>, relative to the page's
// primary rotation.
346
GBool isBelow(TextBlock *blk);
348
void visitSelection(TextSelectionVisitor *visitor,
349
PDFRectangle *selection,
350
SelectionStyle style);
352
// Get the head of the linked list of TextLines.
353
// First line of this block's linked list of lines.
TextLine *getLines() {
  return this->lines;
}
355
// Get the next TextBlock on the linked list.
356
// Follow the <next> link to the following block on the list.
TextBlock *getNext() {
  return this->next;
}
358
void getBBox(double *xMinA, double *yMinA, double *xMaxA, double *yMaxA)
359
{ *xMinA = xMin; *yMinA = yMin; *xMaxA = xMax; *yMaxA = yMax; }
361
int getLineCount() { return nLines; }
365
GBool isBeforeByRule1(TextBlock *blk1);
366
GBool isBeforeByRepeatedRule1(TextBlock *blkList, TextBlock *blk1);
367
GBool isBeforeByRule2(TextBlock *blk1);
369
int visitDepthFirst(TextBlock *blkList, int pos1,
370
TextBlock **sorted, int sortPos,
373
TextPage *page; // the parent page
374
int rot; // text rotation
375
double xMin, xMax; // bounding box x coordinates
376
double yMin, yMax; // bounding box y coordinates
377
double priMin, priMax; // whitespace bounding box along primary axis
378
double ExMin, ExMax; // extended bounding box x coordinates
379
double EyMin, EyMax; // extended bounding box y coordinates
380
int tableId; // id of table to which this block belongs
381
GBool tableEnd; // is this block at end of line of actual table
383
TextPool *pool; // pool of words (used only until lines
385
TextLine *lines; // linked list of lines
386
TextLine *curLine; // most recently added line
387
int nLines; // number of lines
388
int charCount; // number of characters in the block
389
int col; // starting column
390
int nColumns; // number of columns in the block
393
TextBlock *stackNext;
395
friend class TextLine;
396
friend class TextLineFrag;
397
friend class TextFlow;
398
friend class TextWordList;
399
friend class TextPage;
400
friend class TextSelectionPainter;
401
friend class TextSelectionDumper;
404
//------------------------------------------------------------------------
406
//------------------------------------------------------------------------
411
TextFlow(TextPage *pageA, TextBlock *blk);
414
// Add a block to the end of this flow.
415
void addBlock(TextBlock *blk);
417
// Returns true if <blk> fits below <prevBlk> in the flow, i.e., (1)
418
// it uses a font no larger than the last block added to the flow,
419
// and (2) it fits within the flow's [priMin, priMax] along the
// primary axis.
421
GBool blockFits(TextBlock *blk, TextBlock *prevBlk);
423
// Get the head of the linked list of TextBlocks.
424
// First block of this flow's block list.
TextBlock *getBlocks() {
  return this->blocks;
}
426
// Get the next TextFlow on the linked list.
427
// Follow the <next> link to the following flow on the list.
TextFlow *getNext() {
  return this->next;
}
431
TextPage *page; // the parent page
432
double xMin, xMax; // bounding box x coordinates
433
double yMin, yMax; // bounding box y coordinates
434
double priMin, priMax; // whitespace bounding box along primary axis
435
TextBlock *blocks; // blocks in flow
436
TextBlock *lastBlk; // last block in this flow
439
friend class TextWordList;
440
friend class TextPage;
443
#if TEXTOUT_WORD_LIST
445
//------------------------------------------------------------------------
447
//------------------------------------------------------------------------
452
// Build a flat word list, in content stream order (if
453
// text->rawOrder is true), physical layout order (if <physLayout>
454
// is true and text->rawOrder is false), or reading order (if both
// flags are false).
456
TextWordList(TextPage *text, GBool physLayout);
460
// Return the number of words on the list.
463
// Return the <idx>th word from the list.
464
TextWord *get(int idx);
468
GooList *words; // [TextWord]
471
#endif // TEXTOUT_WORD_LIST
473
//------------------------------------------------------------------------
475
//------------------------------------------------------------------------
481
TextPage(GBool rawOrderA);
487
void startPage(GfxState *state);
489
// End the current page.
492
// Update the current font.
493
void updateFont(GfxState *state);
496
void beginWord(GfxState *state, double x0, double y0);
498
// Add a character to the current word.
499
void addChar(GfxState *state, double x, double y,
500
double dx, double dy,
501
CharCode c, int nBytes, Unicode *u, int uLen);
503
// End the current word, sorting it into the list of words.
506
// Add a word, sorting it into the list of words.
507
void addWord(TextWord *word);
509
// Add a (potential) underline.
510
void addUnderline(double x0, double y0, double x1, double y1);
513
void addLink(int xMin, int yMin, int xMax, int yMax, Link *link);
515
// Coalesce strings that look like parts of the same line.
516
void coalesce(GBool physLayout, GBool doHTML);
518
// Find a string. If <startAtTop> is true, starts looking at the
519
// top of the page; else if <startAtLast> is true, starts looking
520
// immediately after the last find result; else starts looking at
521
// <xMin>,<yMin>. If <stopAtBottom> is true, stops looking at the
522
// bottom of the page; else if <stopAtLast> is true, stops looking
523
// just before the last find result; else stops looking at
// <xMax>,<yMax>.
525
GBool findText(Unicode *s, int len,
526
GBool startAtTop, GBool stopAtBottom,
527
GBool startAtLast, GBool stopAtLast,
528
GBool caseSensitive, GBool backward,
529
double *xMin, double *yMin,
530
double *xMax, double *yMax);
532
// Get the text which is inside the specified rectangle.
533
GooString *getText(double xMin, double yMin,
534
double xMax, double yMax);
536
void visitSelection(TextSelectionVisitor *visitor,
537
PDFRectangle *selection,
538
SelectionStyle style);
540
void drawSelection(OutputDev *out,
543
PDFRectangle *selection,
544
SelectionStyle style,
545
GfxColor *glyph_color, GfxColor *box_color);
547
GooList *getSelectionRegion(PDFRectangle *selection,
548
SelectionStyle style,
551
GooString *getSelectionText(PDFRectangle *selection,
552
SelectionStyle style);
554
// Find a string by character position and length. If found, sets
555
// the text bounding rectangle and returns true; otherwise returns
// false.
557
GBool findCharRange(int pos, int length,
558
double *xMin, double *yMin,
559
double *xMax, double *yMax);
561
// Dump contents of page to a file.
562
void dump(void *outputStream, TextOutputFunc outputFunc,
565
// Get the head of the linked list of TextFlows.
566
// First flow of this page's linked list of flows.
TextFlow *getFlows() {
  return this->flows;
}
568
#if TEXTOUT_WORD_LIST
569
// Build a flat word list, in content stream order (if
570
// this->rawOrder is true), physical layout order (if <physLayout>
571
// is true and this->rawOrder is false), or reading order (if both
// flags are false).
573
TextWordList *makeWordList(GBool physLayout);
582
void assignColumns(TextLineFrag *frags, int nFrags, GBool rot);
583
int dumpFragment(Unicode *text, int len, UnicodeMap *uMap, GooString *s);
585
GBool rawOrder; // keep text in content stream order
587
double pageWidth, pageHeight; // width and height of current page
588
TextWord *curWord; // currently active string
589
int charPos; // next character position (within content
591
TextFontInfo *curFont; // current font
592
double curFontSize; // current font size
593
int nest; // current nesting level (for Type 3 fonts)
594
int nTinyChars; // number of "tiny" chars seen so far
595
GBool lastCharOverlap; // set if the last added char overlapped the
598
TextPool *pools[4]; // a "pool" of TextWords for each rotation
599
TextFlow *flows; // linked list of flows
600
TextBlock **blocks; // array of blocks, in yx order
601
int nBlocks; // number of blocks
602
int primaryRot; // primary rotation
603
GBool primaryLR; // primary direction (true means L-to-R,
604
// false means R-to-L)
605
TextWord *rawWords; // list of words, in raw order (only if
607
TextWord *rawLastWord; // last word on rawWords list
609
GooList *fonts; // all font info objects used on this
610
// page [TextFontInfo]
612
double lastFindXMin, // coordinates of the last "find" result
616
GooList *underlines; // [TextUnderline]
617
GooList *links; // [TextLink]
621
friend class TextLine;
622
friend class TextLineFrag;
623
friend class TextBlock;
624
friend class TextFlow;
625
friend class TextWordList;
626
friend class TextSelectionPainter;
627
friend class TextSelectionDumper;
630
//------------------------------------------------------------------------
632
//------------------------------------------------------------------------
636
// Create an ActualText
637
ActualText(TextPage *out);
640
void addChar(GfxState *state, double x, double y,
641
double dx, double dy,
642
CharCode c, int nBytes, Unicode *u, int uLen);
643
void beginMC(Dict *properties);
644
void endMC(GfxState *state);
648
int actualTextBMCLevel; // > 0 when inside ActualText span. Incremented
649
// for each nested BMC inside the span.
650
GooString *actualText; // replacement text for the span
651
GBool newActualTextSpan; // true at start of span. used to init the extent
652
double actualText_x, actualText_y; // extent of the text inside the span
653
double actualText_dx, actualText_dy;
657
//------------------------------------------------------------------------
659
//------------------------------------------------------------------------
661
class TextOutputDev: public OutputDev {
664
// Open a text output file. If <fileName> is NULL, no file is
665
// written (this is useful, e.g., for searching text). If
666
// <physLayoutA> is true, the original physical layout of the text
667
// is maintained. If <rawOrder> is true, the text is kept in
668
// content stream order.
669
TextOutputDev(char *fileName, GBool physLayoutA,
670
GBool rawOrderA, GBool append);
672
// Create a TextOutputDev which will write to a generic stream. If
673
// <physLayoutA> is true, the original physical layout of the text
674
// is maintained. If <rawOrder> is true, the text is kept in
675
// content stream order.
676
TextOutputDev(TextOutputFunc func, void *stream,
677
GBool physLayoutA, GBool rawOrderA);
680
virtual ~TextOutputDev();
682
// Check if file was successfully created.
683
// Reports the <ok> flag recorded for this device.
virtual GBool isOk() {
  return this->ok;
}
685
//---- get info about output device
687
// Does this device use upside-down coordinates?
688
// (Upside-down means (0,0) is the top left corner of the page.)
689
// Always answers gTrue: this device uses upside-down coordinates.
virtual GBool upsideDown() {
  return gTrue;
}
691
// Does this device use drawChar() or drawString()?
692
// Always answers gTrue: text is delivered through drawChar().
virtual GBool useDrawChar() {
  return gTrue;
}
694
// Does this device use beginType3Char/endType3Char? Otherwise,
695
// text in Type 3 fonts will be drawn with drawChar/drawString.
696
// Always answers gFalse: beginType3Char/endType3Char are not used by
// this device.
virtual GBool interpretType3Chars() {
  return gFalse;
}
698
// Does this device need non-text content?
699
// Always answers gFalse: non-text content is not needed by this device.
virtual GBool needNonText() {
  return gFalse;
}
701
//----- initialization and control
704
virtual void startPage(int pageNum, GfxState *state);
707
virtual void endPage();
709
//----- update text state
710
virtual void updateFont(GfxState *state);
713
virtual void beginString(GfxState *state, GooString *s);
714
virtual void endString(GfxState *state);
715
virtual void drawChar(GfxState *state, double x, double y,
716
double dx, double dy,
717
double originX, double originY,
718
CharCode c, int nBytes, Unicode *u, int uLen);
720
//----- grouping operators
721
virtual void beginMarkedContent(char *name, Dict *properties);
722
virtual void endMarkedContent(GfxState *state);
724
//----- path painting
725
virtual void stroke(GfxState *state);
726
virtual void fill(GfxState *state);
727
virtual void eoFill(GfxState *state);
730
virtual void processLink(Link *link, Catalog *catalog);
732
//----- special access
734
// Find a string. If <startAtTop> is true, starts looking at the
735
// top of the page; else if <startAtLast> is true, starts looking
736
// immediately after the last find result; else starts looking at
737
// <xMin>,<yMin>. If <stopAtBottom> is true, stops looking at the
738
// bottom of the page; else if <stopAtLast> is true, stops looking
739
// just before the last find result; else stops looking at
// <xMax>,<yMax>.
741
GBool findText(Unicode *s, int len,
742
GBool startAtTop, GBool stopAtBottom,
743
GBool startAtLast, GBool stopAtLast,
744
GBool caseSensitive, GBool backward,
745
double *xMin, double *yMin,
746
double *xMax, double *yMax);
748
// Get the text which is inside the specified rectangle.
749
GooString *getText(double xMin, double yMin,
750
double xMax, double yMax);
752
// Find a string by character position and length. If found, sets
753
// the text bounding rectangle and returns true; otherwise returns
// false.
755
GBool findCharRange(int pos, int length,
756
double *xMin, double *yMin,
757
double *xMax, double *yMax);
759
void drawSelection(OutputDev *out, double scale, int rotation,
760
PDFRectangle *selection,
761
SelectionStyle style,
762
GfxColor *glyph_color, GfxColor *box_color);
764
GooList *getSelectionRegion(PDFRectangle *selection,
765
SelectionStyle style,
768
GooString *getSelectionText(PDFRectangle *selection,
769
SelectionStyle style);
771
#if TEXTOUT_WORD_LIST
772
// Build a flat word list, in content stream order (if
773
// this->rawOrder is true), physical layout order (if
774
// this->physLayout is true and this->rawOrder is false), or reading
775
// order (if both flags are false).
776
TextWordList *makeWordList();
779
// Returns the TextPage object for the last rasterized page,
780
// transferring ownership to the caller.
781
TextPage *takeText();
783
// Turn extra processing for HTML conversion on or off.
784
// Record <doHTMLA> as the new value of the HTML extra-processing flag.
void enableHTMLExtras(GBool doHTMLA) {
  this->doHTML = doHTMLA;
}
788
TextOutputFunc outputFunc; // output function
789
void *outputStream; // output stream
790
GBool needClose; // need to close the output file?
791
// (only if outputStream is a FILE*)
792
TextPage *text; // text for the current page
793
GBool physLayout; // maintain original physical layout when
795
GBool rawOrder; // keep text in content stream order
796
GBool doHTML; // extra processing for HTML conversion
797
GBool ok; // set up ok?
799
ActualText *actualText;