1
//========================================================================
5
// Copyright 1997-2003 Glyph & Cog, LLC
7
//========================================================================
9
//========================================================================
11
// Modified under the Poppler project - http://poppler.freedesktop.org
13
// Copyright (C) 2005-2007 Kristian Høgsberg <krh@redhat.com>
14
// Copyright (C) 2006 Ed Catmur <ed@catmur.co.uk>
15
// Copyright (C) 2007-2008 Carlos Garcia Campos <carlosgc@gnome.org>
16
// Copyright (C) 2007 Adrian Johnson <ajohnson@redneon.com>
17
// Copyright (C) 2008 Albert Astals Cid <aacid@kde.org>
19
// To see a description of the changes please see the Changelog file that
20
// came with your tarball or type make ChangeLog if you are building from git
22
//========================================================================
24
#ifndef TEXTOUTPUTDEV_H
25
#define TEXTOUTPUTDEV_H
27
#ifdef USE_GCC_PRAGMAS
31
#include "poppler-config.h"
33
#include "goo/gtypes.h"
36
#include "OutputDev.h"
54
class TextSelectionVisitor;
56
//------------------------------------------------------------------------
58
// Callback used to emit text: it is called with a caller-supplied
// <stream> pointer, a <text> buffer, and a length <len> (presumably the
// number of bytes in <text> -- confirm against the callers).
typedef void (*TextOutputFunc)(void *stream, char *text, int len);
66
//------------------------------------------------------------------------
68
//------------------------------------------------------------------------
73
TextFontInfo(GfxState *state);
76
GBool matches(GfxState *state);
79
// Get the font name (which may be NULL).
80
GooString *getFontName() { return fontName; }
82
// Get font descriptor flags.
83
// Returns true if the fixed-width bit is set in the font descriptor
// flags. The masked bit is normalized to 0/1 so the result stays
// correct when compared against gTrue or stored in a narrow type.
GBool isFixedWidth() { return (flags & fontFixedWidth) != 0; }
84
// Returns true if the serif bit is set in the font descriptor flags.
// The masked bit is normalized to 0/1 so the result stays correct when
// compared against gTrue or stored in a narrow type.
GBool isSerif() { return (flags & fontSerif) != 0; }
85
// Returns true if the symbolic bit is set in the font descriptor flags.
// The masked bit is normalized to 0/1 so the result stays correct when
// compared against gTrue or stored in a narrow type.
GBool isSymbolic() { return (flags & fontSymbolic) != 0; }
86
// Returns true if the italic bit is set in the font descriptor flags.
// The masked bit is normalized to 0/1 so the result stays correct when
// compared against gTrue or stored in a narrow type.
GBool isItalic() { return (flags & fontItalic) != 0; }
87
// Returns true if the bold bit is set in the font descriptor flags.
// The masked bit is normalized to 0/1 so the result stays correct when
// compared against gTrue or stored in a narrow type.
GBool isBold() { return (flags & fontBold) != 0; }
98
friend class TextWord;
99
friend class TextPage;
100
friend class TextSelectionPainter;
103
//------------------------------------------------------------------------
105
//------------------------------------------------------------------------
111
TextWord(GfxState *state, int rotA, double x0, double y0,
112
int charPosA, TextFontInfo *fontA, double fontSize);
117
// Add a character to the word.
118
void addChar(GfxState *state, double x, double y,
119
double dx, double dy, CharCode c, Unicode u);
121
// Merge <word> onto the end of <this>.
122
void merge(TextWord *word);
124
// Compares <this> to <word>, returning -1 (<), 0 (=), or +1 (>),
125
// based on a primary-axis comparison, e.g., x ordering if rot=0.
126
int primaryCmp(TextWord *word);
128
// Return the distance along the primary axis between <this> and
// <word>.
130
double primaryDelta(TextWord *word);
132
static int cmpYX(const void *p1, const void *p2);
134
void visitSelection(TextSelectionVisitor *visitor,
135
PDFRectangle *selection,
136
SelectionStyle style);
138
// Get the TextFontInfo object associated with this word.
139
TextFontInfo *getFontInfo() { return font; }
141
// Get the next TextWord on the linked list.
142
TextWord *getNext() { return next; }
144
#if TEXTOUT_WORD_LIST
145
int getLength() { return len; }
146
// Return a pointer to the <idx>th entry of the word's Unicode text
// array. <idx> is not range-checked here; presumably callers keep it
// within [0, getLength()).
const Unicode *getChar(int idx) { return &text[idx]; }
147
GooString *getText();
148
GooString *getFontName() { return font->fontName; }
149
void getColor(double *r, double *g, double *b)
150
{ *r = colorR; *g = colorG; *b = colorB; }
151
void getBBox(double *xMinA, double *yMinA, double *xMaxA, double *yMaxA)
152
{ *xMinA = xMin; *yMinA = yMin; *xMaxA = xMax; *yMaxA = yMax; }
153
void getCharBBox(int charIdx, double *xMinA, double *yMinA,
154
double *xMaxA, double *yMaxA);
155
double getFontSize() { return fontSize; }
156
int getRotation() { return rot; }
157
int getCharPos() { return charPos; }
158
int getCharLen() { return charLen; }
159
GBool getSpaceAfter() { return spaceAfter; }
161
GBool isUnderlined() { return underlined; }
162
Link *getLink() { return link; }
163
double getEdge(int i) { return edge[i]; }
164
double getBaseline () { return base; }
165
GBool hasSpaceAfter () { return spaceAfter; }
166
// Return the next word in the line (the same value getNext() returns).
TextWord* nextWord () { return next; }
169
int rot; // rotation, multiple of 90 degrees
171
double xMin, xMax; // bounding box x coordinates
172
double yMin, yMax; // bounding box y coordinates
173
double base; // baseline x or y coordinate
174
Unicode *text; // the text
175
CharCode *charcode; // glyph indices
176
double *edge; // "near" edge x or y coord of each char
177
// (plus one extra entry for the last char)
178
int len; // length of text and edge arrays
179
int size; // size of text and edge arrays
180
int charPos; // character position (within content stream)
181
int charLen; // number of content stream characters in
             // this word
183
TextFontInfo *font; // font information
184
double fontSize; // font size
185
GBool spaceAfter; // set if there is a space between this
186
// word and the next word on the line
187
TextWord *next; // next word in line
189
#if TEXTOUT_WORD_LIST
190
double colorR, // word color
198
friend class TextPool;
199
friend class TextLine;
200
friend class TextBlock;
201
friend class TextFlow;
202
friend class TextWordList;
203
friend class TextPage;
205
friend class TextSelectionPainter;
206
friend class TextSelectionDumper;
209
//------------------------------------------------------------------------
211
//------------------------------------------------------------------------
219
// Return the head of the word list stored for baseline bucket
// <baseIdx>. The pool array is indexed relative to minBaseIdx; no range
// check is performed, so the caller must pass an index that is in range.
TextWord *getPool(int baseIdx) { return pool[baseIdx - minBaseIdx]; }
220
// Store <p> as the head of the word list for baseline bucket <baseIdx>.
// The pool array is indexed relative to minBaseIdx; no range check is
// performed, so the caller must pass an index that is in range.
void setPool(int baseIdx, TextWord *p) { pool[baseIdx - minBaseIdx] = p; }
222
int getBaseIdx(double base);
224
void addWord(TextWord *word);
228
int minBaseIdx; // min baseline bucket index
229
int maxBaseIdx; // max baseline bucket index
230
TextWord **pool; // array of linked lists, one for each
231
// baseline value (multiple of 4 pts)
232
TextWord *cursor; // pointer to last-accessed word
233
int cursorBaseIdx; // baseline bucket index of last-accessed word
235
friend class TextBlock;
236
friend class TextPage;
241
//------------------------------------------------------------------------
243
//------------------------------------------------------------------------
248
TextLine(TextBlock *blkA, int rotA, double baseA);
251
void addWord(TextWord *word);
253
// Return the distance along the primary axis between <this> and
// <line>.
255
double primaryDelta(TextLine *line);
257
// Compares <this> to <line>, returning -1 (<), 0 (=), or +1 (>),
258
// based on a primary-axis comparison, e.g., x ordering if rot=0.
259
int primaryCmp(TextLine *line);
261
// Compares <this> to <line>, returning -1 (<), 0 (=), or +1 (>),
262
// based on a secondary-axis comparison of the baselines, e.g., y
263
// ordering if rot=0.
264
int secondaryCmp(TextLine *line);
266
int cmpYX(TextLine *line);
268
static int cmpXY(const void *p1, const void *p2);
270
void coalesce(UnicodeMap *uMap);
272
void visitSelection(TextSelectionVisitor *visitor,
273
PDFRectangle *selection,
274
SelectionStyle style);
276
// Get the head of the linked list of TextWords.
277
TextWord *getWords() { return words; }
279
// Get the next TextLine on the linked list.
280
TextLine *getNext() { return next; }
282
// Returns true if the last char of the line is a hyphen.
283
GBool isHyphenated() { return hyphenated; }
287
TextBlock *blk; // parent block
288
int rot; // text rotation
289
double xMin, xMax; // bounding box x coordinates
290
double yMin, yMax; // bounding box y coordinates
291
double base; // baseline x or y coordinate
292
TextWord *words; // words in this line
293
TextWord *lastWord; // last word in this line
294
Unicode *text; // Unicode text of the line, including
295
// spaces between words
296
double *edge; // "near" edge x or y coord of each char
297
// (plus one extra entry for the last char)
298
int *col; // starting column number of each Unicode char
299
int len; // number of Unicode chars
300
int convertedLen; // total number of converted characters
301
GBool hyphenated; // set if last char is a hyphen
302
TextLine *next; // next line in block
303
Unicode *normalized; // normalized form of Unicode text
304
int normalized_len; // number of normalized Unicode chars
305
int *normalized_idx; // indices of normalized chars into Unicode text
307
friend class TextLineFrag;
308
friend class TextBlock;
309
friend class TextFlow;
310
friend class TextWordList;
311
friend class TextPage;
313
friend class TextSelectionPainter;
314
friend class TextSelectionSizer;
315
friend class TextSelectionDumper;
318
//------------------------------------------------------------------------
320
//------------------------------------------------------------------------
325
TextBlock(TextPage *pageA, int rotA);
328
void addWord(TextWord *word);
330
void coalesce(UnicodeMap *uMap);
332
// Update this block's priMin and priMax values, looking at <blk>.
333
void updatePriMinMax(TextBlock *blk);
335
static int cmpXYPrimaryRot(const void *p1, const void *p2);
337
static int cmpYXPrimaryRot(const void *p1, const void *p2);
339
int primaryCmp(TextBlock *blk);
341
double secondaryDelta(TextBlock *blk);
343
// Returns true if <this> is below <blk>, relative to the page's
// primary rotation.
345
GBool isBelow(TextBlock *blk);
347
void visitSelection(TextSelectionVisitor *visitor,
348
PDFRectangle *selection,
349
SelectionStyle style);
351
// Get the head of the linked list of TextLines.
352
TextLine *getLines() { return lines; }
354
// Get the next TextBlock on the linked list.
355
TextBlock *getNext() { return next; }
357
void getBBox(double *xMinA, double *yMinA, double *xMaxA, double *yMaxA)
358
{ *xMinA = xMin; *yMinA = yMin; *xMaxA = xMax; *yMaxA = yMax; }
360
int getLineCount() { return nLines; }
364
TextPage *page; // the parent page
365
int rot; // text rotation
366
double xMin, xMax; // bounding box x coordinates
367
double yMin, yMax; // bounding box y coordinates
368
double priMin, priMax; // whitespace bounding box along primary axis
370
TextPool *pool; // pool of words (used only until lines
                // are built)
372
TextLine *lines; // linked list of lines
373
TextLine *curLine; // most recently added line
374
int nLines; // number of lines
375
int charCount; // number of characters in the block
376
int col; // starting column
377
int nColumns; // number of columns in the block
380
TextBlock *stackNext;
382
friend class TextLine;
383
friend class TextLineFrag;
384
friend class TextFlow;
385
friend class TextWordList;
386
friend class TextPage;
387
friend class TextSelectionPainter;
390
//------------------------------------------------------------------------
392
//------------------------------------------------------------------------
397
TextFlow(TextPage *pageA, TextBlock *blk);
400
// Add a block to the end of this flow.
401
void addBlock(TextBlock *blk);
403
// Returns true if <blk> fits below <prevBlk> in the flow, i.e., (1)
404
// it uses a font no larger than the last block added to the flow,
405
// and (2) it fits within the flow's [priMin, priMax] along the
// primary axis.
407
GBool blockFits(TextBlock *blk, TextBlock *prevBlk);
409
// Get the head of the linked list of TextBlocks.
410
TextBlock *getBlocks() { return blocks; }
412
// Get the next TextFlow on the linked list.
413
TextFlow *getNext() { return next; }
417
TextPage *page; // the parent page
418
double xMin, xMax; // bounding box x coordinates
419
double yMin, yMax; // bounding box y coordinates
420
double priMin, priMax; // whitespace bounding box along primary axis
421
TextBlock *blocks; // blocks in flow
422
TextBlock *lastBlk; // last block in this flow
425
friend class TextWordList;
426
friend class TextPage;
429
#if TEXTOUT_WORD_LIST
431
//------------------------------------------------------------------------
433
//------------------------------------------------------------------------
438
// Build a flat word list, in content stream order (if
439
// text->rawOrder is true), physical layout order (if <physLayout>
440
// is true and text->rawOrder is false), or reading order (if both
// flags are false).
442
TextWordList(TextPage *text, GBool physLayout);
446
// Return the number of words on the list.
449
// Return the <idx>th word from the list.
450
TextWord *get(int idx);
454
GooList *words; // [TextWord]
457
#endif // TEXTOUT_WORD_LIST
459
//------------------------------------------------------------------------
461
//------------------------------------------------------------------------
467
TextPage(GBool rawOrderA);
473
void startPage(GfxState *state);
475
// End the current page.
478
// Update the current font.
479
void updateFont(GfxState *state);
482
void beginWord(GfxState *state, double x0, double y0);
484
// Add a character to the current word.
485
void addChar(GfxState *state, double x, double y,
486
double dx, double dy,
487
CharCode c, int nBytes, Unicode *u, int uLen);
489
// End the current word, sorting it into the list of words.
492
// Add a word, sorting it into the list of words.
493
void addWord(TextWord *word);
495
// Add a (potential) underline.
496
void addUnderline(double x0, double y0, double x1, double y1);
499
void addLink(int xMin, int yMin, int xMax, int yMax, Link *link);
501
// Coalesce strings that look like parts of the same line.
502
void coalesce(GBool physLayout, GBool doHTML);
504
// Find a string. If <startAtTop> is true, starts looking at the
505
// top of the page; else if <startAtLast> is true, starts looking
506
// immediately after the last find result; else starts looking at
507
// <xMin>,<yMin>. If <stopAtBottom> is true, stops looking at the
508
// bottom of the page; else if <stopAtLast> is true, stops looking
509
// just before the last find result; else stops looking at
// <xMax>,<yMax>.
511
GBool findText(Unicode *s, int len,
512
GBool startAtTop, GBool stopAtBottom,
513
GBool startAtLast, GBool stopAtLast,
514
GBool caseSensitive, GBool backward,
515
double *xMin, double *yMin,
516
double *xMax, double *yMax);
518
// Get the text which is inside the specified rectangle.
519
GooString *getText(double xMin, double yMin,
520
double xMax, double yMax);
522
void visitSelection(TextSelectionVisitor *visitor,
523
PDFRectangle *selection,
524
SelectionStyle style);
526
void drawSelection(OutputDev *out,
529
PDFRectangle *selection,
530
SelectionStyle style,
531
GfxColor *glyph_color, GfxColor *box_color);
533
GooList *getSelectionRegion(PDFRectangle *selection,
534
SelectionStyle style,
537
GooString *getSelectionText(PDFRectangle *selection,
538
SelectionStyle style);
540
// Find a string by character position and length. If found, sets
541
// the text bounding rectangle and returns true; otherwise returns
// false.
543
GBool findCharRange(int pos, int length,
544
double *xMin, double *yMin,
545
double *xMax, double *yMax);
547
// Dump contents of page to a file.
548
void dump(void *outputStream, TextOutputFunc outputFunc,
551
// Get the head of the linked list of TextFlows.
552
TextFlow *getFlows() { return flows; }
554
#if TEXTOUT_WORD_LIST
555
// Build a flat word list, in content stream order (if
556
// this->rawOrder is true), physical layout order (if <physLayout>
557
// is true and this->rawOrder is false), or reading order (if both
// flags are false).
559
TextWordList *makeWordList(GBool physLayout);
568
void assignColumns(TextLineFrag *frags, int nFrags, int rot);
569
int dumpFragment(Unicode *text, int len, UnicodeMap *uMap, GooString *s);
571
GBool rawOrder; // keep text in content stream order
573
double pageWidth, pageHeight; // width and height of current page
574
TextWord *curWord; // currently active string
575
int charPos; // next character position (within content
             // stream)
577
TextFontInfo *curFont; // current font
578
double curFontSize; // current font size
579
int nest; // current nesting level (for Type 3 fonts)
580
int nTinyChars; // number of "tiny" chars seen so far
581
GBool lastCharOverlap; // set if the last added char overlapped the
                       // previous char
584
TextPool *pools[4]; // a "pool" of TextWords for each rotation
585
TextFlow *flows; // linked list of flows
586
TextBlock **blocks; // array of blocks, in yx order
587
int nBlocks; // number of blocks
588
int primaryRot; // primary rotation
589
GBool primaryLR; // primary direction (true means L-to-R,
590
// false means R-to-L)
591
TextWord *rawWords; // list of words, in raw order (only if
                    // rawOrder is set)
593
TextWord *rawLastWord; // last word on rawWords list
595
GooList *fonts; // all font info objects used on this
596
// page [TextFontInfo]
598
double lastFindXMin, // coordinates of the last "find" result
602
GooList *underlines; // [TextUnderline]
603
GooList *links; // [TextLink]
607
friend class TextLine;
608
friend class TextLineFrag;
609
friend class TextBlock;
610
friend class TextFlow;
611
friend class TextWordList;
612
friend class TextSelectionPainter;
613
friend class TextSelectionDumper;
616
//------------------------------------------------------------------------
618
//------------------------------------------------------------------------
622
// Create an ActualText
623
ActualText(TextPage *out);
626
void addChar(GfxState *state, double x, double y,
627
double dx, double dy,
628
CharCode c, int nBytes, Unicode *u, int uLen);
629
void beginMC(Dict *properties);
630
void endMC(GfxState *state);
634
int actualTextBMCLevel; // > 0 when inside ActualText span. Incremented
635
// for each nested BMC inside the span.
636
GooString *actualText; // replacement text for the span
637
GBool newActualTextSpan; // true at start of span. used to init the extent
638
double actualText_x, actualText_y; // extent of the text inside the span
639
double actualText_dx, actualText_dy;
643
//------------------------------------------------------------------------
645
//------------------------------------------------------------------------
647
class TextOutputDev: public OutputDev {
650
// Open a text output file. If <fileName> is NULL, no file is
651
// written (this is useful, e.g., for searching text). If
652
// <physLayoutA> is true, the original physical layout of the text
653
// is maintained. If <rawOrder> is true, the text is kept in
654
// content stream order.
655
TextOutputDev(char *fileName, GBool physLayoutA,
656
GBool rawOrderA, GBool append);
658
// Create a TextOutputDev which will write to a generic stream. If
659
// <physLayoutA> is true, the original physical layout of the text
660
// is maintained. If <rawOrder> is true, the text is kept in
661
// content stream order.
662
TextOutputDev(TextOutputFunc func, void *stream,
663
GBool physLayoutA, GBool rawOrderA);
666
virtual ~TextOutputDev();
668
// Check if file was successfully created.
669
virtual GBool isOk() { return ok; }
671
//---- get info about output device
673
// Does this device use upside-down coordinates?
674
// (Upside-down means (0,0) is the top left corner of the page.)
675
virtual GBool upsideDown() { return gTrue; }
677
// Does this device use drawChar() or drawString()?
678
virtual GBool useDrawChar() { return gTrue; }
680
// Does this device use beginType3Char/endType3Char? Otherwise,
681
// text in Type 3 fonts will be drawn with drawChar/drawString.
682
virtual GBool interpretType3Chars() { return gFalse; }
684
// Does this device need non-text content?
685
virtual GBool needNonText() { return gFalse; }
687
//----- initialization and control
690
virtual void startPage(int pageNum, GfxState *state);
693
virtual void endPage();
695
//----- update text state
696
virtual void updateFont(GfxState *state);
699
virtual void beginString(GfxState *state, GooString *s);
700
virtual void endString(GfxState *state);
701
virtual void drawChar(GfxState *state, double x, double y,
702
double dx, double dy,
703
double originX, double originY,
704
CharCode c, int nBytes, Unicode *u, int uLen);
706
//----- grouping operators
707
virtual void beginMarkedContent(char *name, Dict *properties);
708
virtual void endMarkedContent(GfxState *state);
710
//----- path painting
711
virtual void stroke(GfxState *state);
712
virtual void fill(GfxState *state);
713
virtual void eoFill(GfxState *state);
716
virtual void processLink(Link *link, Catalog *catalog);
718
//----- special access
720
// Find a string. If <startAtTop> is true, starts looking at the
721
// top of the page; else if <startAtLast> is true, starts looking
722
// immediately after the last find result; else starts looking at
723
// <xMin>,<yMin>. If <stopAtBottom> is true, stops looking at the
724
// bottom of the page; else if <stopAtLast> is true, stops looking
725
// just before the last find result; else stops looking at
// <xMax>,<yMax>.
727
GBool findText(Unicode *s, int len,
728
GBool startAtTop, GBool stopAtBottom,
729
GBool startAtLast, GBool stopAtLast,
730
GBool caseSensitive, GBool backward,
731
double *xMin, double *yMin,
732
double *xMax, double *yMax);
734
// Get the text which is inside the specified rectangle.
735
GooString *getText(double xMin, double yMin,
736
double xMax, double yMax);
738
// Find a string by character position and length. If found, sets
739
// the text bounding rectangle and returns true; otherwise returns
// false.
741
GBool findCharRange(int pos, int length,
742
double *xMin, double *yMin,
743
double *xMax, double *yMax);
745
void drawSelection(OutputDev *out, double scale, int rotation,
746
PDFRectangle *selection,
747
SelectionStyle style,
748
GfxColor *glyph_color, GfxColor *box_color);
750
GooList *getSelectionRegion(PDFRectangle *selection,
751
SelectionStyle style,
754
GooString *getSelectionText(PDFRectangle *selection,
755
SelectionStyle style);
757
#if TEXTOUT_WORD_LIST
758
// Build a flat word list, in content stream order (if
759
// this->rawOrder is true), physical layout order (if
760
// this->physLayout is true and this->rawOrder is false), or reading
761
// order (if both flags are false).
762
TextWordList *makeWordList();
765
// Returns the TextPage object for the last rasterized page,
766
// transferring ownership to the caller.
767
TextPage *takeText();
769
// Turn extra processing for HTML conversion on or off.
770
void enableHTMLExtras(GBool doHTMLA) { doHTML = doHTMLA; }
774
TextOutputFunc outputFunc; // output function
775
void *outputStream; // output stream
776
GBool needClose; // need to close the output file?
777
// (only if outputStream is a FILE*)
778
TextPage *text; // text for the current page
779
GBool physLayout; // maintain the original physical layout of the text
781
GBool rawOrder; // keep text in content stream order
782
GBool doHTML; // extra processing for HTML conversion
783
GBool ok; // set up ok?
785
ActualText *actualText;