1
//========================================================================
6
// Copyright 1999-2000 G. Ovtcharov
7
//========================================================================
9
//========================================================================
11
// Modified under the Poppler project - http://poppler.freedesktop.org
13
// All changes made under the Poppler project to this file are licensed
14
// under GPL version 2 or later
16
// Copyright (C) 2007-2008, 2010 Albert Astals Cid <aacid@kde.org>
17
// Copyright (C) 2010 Hib Eris <hib@hiberis.nl>
18
// Copyright (C) 2010 Mike Slegeir <tehpola@yahoo.com>
19
// Copyright (C) 2010 Suzuki Toshiya <mpsuzuki@hiroshima-u.ac.jp>
20
// Copyright (C) 2010 OSSD CDAC Mumbai by Leena Chourey (leenac@cdacmumbai.in) and Onkar Potdar (onkar@cdacmumbai.in)
22
// To see a description of the changes please see the Changelog file that
23
// came with your tarball or type make ChangeLog if you are building from git
25
//========================================================================
28
#include <poppler-config.h>
37
#include "parseargs.h"
38
#include "goo/GooString.h"
48
#include "PDFDocFactory.h"
49
#include "HtmlOutputDev.h"
51
#include "SplashOutputDev.h"
52
#include "splash/SplashBitmap.h"
54
#include "PSOutputDev.h"
55
#include "GlobalParams.h"
58
#include "goo/gfile.h"
61
# define GHOSTSCRIPT "gs"
64
// ---------------------------------------------------------------------
// Command-line option state. The variables referenced from argDesc are
// filled in by parseArgs() in main(); the initializers are the defaults.
// Non-static variables have external linkage (presumably read by other
// translation units such as the HTML output device -- confirm).
// ---------------------------------------------------------------------
// -f: first page to convert.
static int firstPage = 1;
65
// -l: last page to convert. main() replaces a value < 1 or beyond the
// document with doc->getNumPages().
static int lastPage = 0;
66
// The "-raw" option that would set this is commented out in argDesc;
// main() assigns it from complexMode / singleHtml.
static GBool rawOrder = gTrue;
67
// Forced to gFalse in main() after GlobalParams is created.
GBool printCommands = gTrue;
68
// -h / -help: print usage information.
static GBool printHelp = gFalse;
69
// -p: exchange .pdf links by .html.
GBool printHtml = gFalse;
70
// -c: generate complex document.
GBool complexMode=gFalse;
71
// -s: generate a single document that includes all pages.
GBool singleHtml=gFalse; // singleHtml
73
// Not referenced in the visible code -- presumably selects Splash versus
// Ghostscript for background images; confirm against the full file.
static GBool useSplash=gTrue;
74
// -fmt: image file extension (5-byte buffer). main() may overwrite it
// from extsList when a Ghostscript device is given with -dev.
static char extension[5]="png";
75
// -zoom: zoom factor; main() clamps it to the range [0.5, 3.0].
static double scale=1.5;
76
// -noframes: generate no frames.
GBool noframes=gFalse;
79
// -q: passed to globalParams->setErrQuiet() and also suppresses the
// "Failed to launch Ghostscript!" message in main().
static GBool errQuiet=gFalse;
80
// -nodrm: override document DRM settings.
static GBool noDrm=gFalse;
82
// -hidden: output hidden text.
GBool showHidden = gFalse;
83
// -nomerge: do not merge paragraphs.
GBool noMerge = gFalse;
84
// -opw / -upw: owner and user passwords for encrypted files
// (33-byte buffers; empty string means "not given").
static char ownerPassword[33] = "";
85
static char userPassword[33] = "";
86
// -dev: Ghostscript output device name; "none" means no device was chosen.
static char gsDevice[33] = "none";
87
// -v: print copyright and version info.
static GBool printVersion = gFalse;
89
// Helpers defined at the end of this file; both read one entry of the
// document info dictionary and return a newly allocated GooString or NULL.
static GooString* getInfoString(Dict *infoDict, char *key);
90
static GooString* getInfoDate(Dict *infoDict, char *key);
92
// -enc: output text encoding name; empty string means "not given".
static char textEncName[128] = "";
94
// Command-line option table handed to parseArgs() and printUsage() in
// main(). As used here each entry is
//   { option name, argument kind, storage, buffer size, usage text }
// where the buffer size is sizeof(buffer) for argString options and 0 for
// the flag / int / floating-point options.
static const ArgDesc argDesc[] = {
95
// Page range.
{"-f", argInt, &firstPage, 0,
96
"first page to convert"},
97
{"-l", argInt, &lastPage, 0,
98
"last page to convert"},
99
/*{"-raw", argFlag, &rawOrder, 0,
100
"keep strings in content stream order"},*/
101
{"-q", argFlag, &errQuiet, 0,
102
"don't print any messages or errors"},
103
// -h and -help are aliases: both set printHelp.
{"-h", argFlag, &printHelp, 0,
104
"print usage information"},
105
{"-help", argFlag, &printHelp, 0,
106
"print usage information"},
107
// Output shape options.
{"-p", argFlag, &printHtml, 0,
108
"exchange .pdf links by .html"},
109
{"-c", argFlag, &complexMode, 0,
110
"generate complex document"},
111
{"-s", argFlag, &singleHtml, 0,
112
"generate single document that includes all pages"},
113
// When set, main() skips the background-image rendering step.
{"-i", argFlag, &ignore, 0,
115
{"-noframes", argFlag, &noframes, 0,
116
"generate no frames"},
117
{"-stdout" ,argFlag, &stout, 0,
118
"use standard output"},
119
// main() clamps the zoom factor to [0.5, 3.0] after parsing.
{"-zoom", argFP, &scale, 0,
120
"zoom the pdf document (default 1.5)"},
121
{"-xml", argFlag, &xml, 0,
122
"output for XML post-processing"},
123
{"-hidden", argFlag, &showHidden, 0,
124
"output hidden text"},
125
{"-nomerge", argFlag, &noMerge, 0,
126
"do not merge paragraphs"},
127
// String options: the fourth field is the destination buffer size.
{"-enc", argString, textEncName, sizeof(textEncName),
128
"output text encoding name"},
129
{"-dev", argString, gsDevice, sizeof(gsDevice),
130
"output device name for Ghostscript (png16m, jpeg etc)"},
131
// extension is a 5-byte buffer.
{"-fmt", argString, extension, sizeof(extension),
132
"image file format for Splash output (png or jpg)"},
133
{"-v", argFlag, &printVersion, 0,
134
"print copyright and version info"},
135
{"-opw", argString, ownerPassword, sizeof(ownerPassword),
136
"owner password (for encrypted files)"},
137
{"-upw", argString, userPassword, sizeof(userPassword),
138
"user password (for encrypted files)"},
139
{"-nodrm", argFlag, &noDrm, 0,
140
"override document DRM settings"},
145
// SplashOutputDev subclass whose text-related hooks are all empty or
// return a false value. main() uses it to render the per-page images
// written with SplashBitmap::writeImgFile().
class SplashOutputDevNoText : public SplashOutputDev {
147
// Forwards its arguments to the SplashOutputDev constructor.
SplashOutputDevNoText(SplashColorMode colorModeA, int bitmapRowPadA,
148
GBool reverseVideoA, SplashColorPtr paperColorA,
149
GBool bitmapTopDownA = gTrue,
150
GBool allowAntialiasA = gTrue) : SplashOutputDev(colorModeA,
151
bitmapRowPadA, reverseVideoA, paperColorA, bitmapTopDownA,
153
virtual ~SplashOutputDevNoText() { }
155
// Character drawing is a no-op.
void drawChar(GfxState *state, double x, double y,
156
double dx, double dy,
157
double originX, double originY,
158
CharCode code, int nBytes, Unicode *u, int uLen) { }
159
// NOTE(review): this and deviceHasTextClip() return `false` while
// interpretType3Chars() returns gFalse -- same intent, inconsistent spelling.
GBool beginType3Char(GfxState *state, double x, double y,
160
double dx, double dy,
161
CharCode code, Unicode *u, int uLen) { return false; }
162
void endType3Char(GfxState *state) { }
163
// Text objects are ignored as well.
void beginTextObject(GfxState *state) { }
164
GBool deviceHasTextClip(GfxState *state) { return false; }
165
void endTextObject(GfxState *state) { }
166
GBool interpretType3Chars() { return gFalse; }
170
// Program entry point for pdftohtml. From the visible code, in order:
//   1. parse options and print version/usage on bad usage, -h or -v;
//   2. create GlobalParams and apply -q / -enc;
//   3. open the PDF (argv[1], or "fd://0" when argv[1] is "-");
//   4. derive the output base name from argv[2] or from the input name;
//   5. read the document info fields and run HtmlOutputDev over the pages;
//   6. for complex / single-document output, render page images either
//      with SplashOutputDevNoText or via a PostScript file + Ghostscript;
//   7. free resources and run Object::memCheck().
int main(int argc, char *argv[]) {
172
GooString *fileName = NULL;
173
GooString *docTitle = NULL;
174
GooString *author = NULL, *keywords = NULL, *subject = NULL, *date = NULL;
175
GooString *htmlFileName = NULL;
176
GooString *psFileName = NULL;
177
HtmlOutputDev *htmlOut = NULL;
179
SplashOutputDev *splashOut = NULL;
181
PSOutputDev *psOut = NULL;
184
GooString *ownerPW, *userPW;
186
// Known image extensions; matched as substrings of the -dev device name
// further down. NULL terminates the list.
char * extsList[] = {"png", "jpeg", "bmp", "pcx", "tiff", "pbm", NULL};
189
// parseArgs() receives &argc, so presumably it strips the recognised
// options and leaves only the positional arguments (1 or 2 are accepted).
ok = parseArgs(argDesc, &argc, argv);
190
if (!ok || argc < 2 || argc > 3 || printHelp || printVersion) {
191
fprintf(stderr, "pdftohtml version %s\n", PACKAGE_VERSION);
192
fprintf(stderr, "%s\n", popplerCopyright);
193
fprintf(stderr, "%s\n", "Copyright 1999-2003 Gueorgui Ovtcharov and Rainer Dorsch");
194
fprintf(stderr, "%s\n\n", xpdfCopyright);
196
printUsage("pdftohtml", "<PDF-file> [<html-file> <xml-file>]", argDesc);
205
// Global poppler configuration.
globalParams = new GlobalParams();
208
globalParams->setErrQuiet(errQuiet);
209
printCommands = gFalse; // I'm not 100% sure what the difference between them is
212
// Apply -enc, then check that the encoding was accepted.
if (textEncName[0]) {
213
globalParams->setTextEncoding(textEncName);
214
if( !globalParams->getTextEncoding() ) {
220
// Wrap the passwords in GooStrings only when they were supplied.
if (ownerPassword[0]) {
221
ownerPW = new GooString(ownerPassword);
225
if (userPassword[0]) {
226
userPW = new GooString(userPassword);
231
fileName = new GooString(argv[1]);
233
// "-" as input name is mapped to the pseudo file name "fd://0".
if (fileName->cmp("-") == 0) {
235
fileName = new GooString("fd://0");
238
doc = PDFDocFactory().createPDFDoc(*fileName, ownerPW, userPW);
250
// check for copy permission
251
if (!doc->okToCopy()) {
253
error(-1, "Copying of text from this document is not allowed.");
256
fprintf(stderr, "Document has copy-protection bit set.\n");
259
// construct text file name
261
// Output name given explicitly: strip a trailing ".html"/".HTML" or
// ".xml"/".XML" (exact-case matches only) to obtain the base name.
GooString* tmp = new GooString(argv[2]);
263
if (tmp->getLength() >= 5) {
264
p = tmp->getCString() + tmp->getLength() - 5;
265
if (!strcmp(p, ".html") || !strcmp(p, ".HTML")) {
266
htmlFileName = new GooString(tmp->getCString(), tmp->getLength() - 5);
270
if (tmp->getLength() >= 4) {
271
p = tmp->getCString() + tmp->getLength() - 4;
272
if (!strcmp(p, ".xml") || !strcmp(p, ".XML")) {
273
htmlFileName = new GooString(tmp->getCString(), tmp->getLength() - 4);
278
htmlFileName =new GooString(tmp);
281
} else if (fileName->cmp("fd://0") == 0) {
282
// NOTE(review): "form" in this message looks like a typo for "from".
error(-1, "You have to provide an output filename when reading form stdin.");
285
// No output name: derive it from the input name by dropping ".pdf"/".PDF".
// NOTE(review): no length check is visible before subtracting 4; an input
// name shorter than 4 characters would make p point before the buffer.
p = fileName->getCString() + fileName->getLength() - 4;
286
if (!strcmp(p, ".pdf") || !strcmp(p, ".PDF"))
287
htmlFileName = new GooString(fileName->getCString(),
288
fileName->getLength() - 4);
290
htmlFileName = fileName->copy();
291
// htmlFileName->append(".html");
294
// Clamp the zoom factor to [0.5, 3.0].
if (scale>3.0) scale=3.0;
295
if (scale<0.5) scale=0.5;
297
if (complexMode || singleHtml) {
319
// Treat an unset or out-of-range last page as "through the last page".
if (lastPage < 1 || lastPage > doc->getNumPages())
320
lastPage = doc->getNumPages();
322
// Pull metadata from the document info dictionary.
doc->getDocInfo(&info);
324
docTitle = getInfoString(info.getDict(), "Title");
325
author = getInfoString(info.getDict(), "Author");
326
keywords = getInfoString(info.getDict(), "Keywords");
327
subject = getInfoString(info.getDict(), "Subject");
328
date = getInfoDate(info.getDict(), "ModDate");
330
// Presumably only reached when ModDate was absent -- the guarding line is
// not visible here; confirm.
date = getInfoDate(info.getDict(), "CreationDate");
333
// Fall back to the output base name when the PDF has no title.
if( !docTitle ) docTitle = new GooString(htmlFileName);
335
// strcmp() is non-zero when a Ghostscript device other than "none" was given.
if( strcmp("none", gsDevice) ) {
337
/* determine extensions of output background images */
339
for(i = 0; extsList[i]; i++)
341
if( strstr(gsDevice, extsList[i]) != (char *) NULL )
343
// Every extsList entry is at most 4 characters, so it fits the 5-byte
// extension buffer including the terminating NUL.
strncpy(extension, extsList[i], sizeof(extension));
351
fprintf(stderr, "You are trying to use the -fmt option but your pdftohtml was built without support for it. Please use the -dev option\n");
366
rawOrder = complexMode; // TODO: figure out what exactly rawOrder does
368
rawOrder = singleHtml;
371
// Create the HTML output device; optional metadata is passed as NULL
// when the corresponding info entry was missing.
htmlOut = new HtmlOutputDev(htmlFileName->getCString(),
372
docTitle->getCString(),
373
author ? author->getCString() : NULL,
374
keywords ? keywords->getCString() : NULL,
375
subject ? subject->getCString() : NULL,
376
date ? date->getCString() : NULL,
380
doc->getCatalog()->getOutline()->isDict());
401
// Render the requested pages at 72 * scale DPI in both directions.
doc->displayPages(htmlOut, firstPage, lastPage, 72 * scale, 72 * scale, 0,
402
gTrue, gFalse, gFalse);
405
htmlOut->dumpDocOutline(doc->getCatalog());
409
// Page images are only produced for complex / single-document output,
// and not for XML output or when -i was given.
if ((complexMode || singleHtml) && !xml && !ignore) {
412
GooString *imgFileName = NULL;
415
// All three colour components set to 255; passed to the Splash device
// as its paper colour.
color[0] = color[1] = color[2] = 255;
416
// If the user specified "jpg" use JPEG, otherwise PNG
417
SplashImageFileFormat format = strcmp(extension, "jpg") ?
418
splashFormatPng : splashFormatJpeg;
420
splashOut = new SplashOutputDevNoText(splashModeRGB8, 4, gFalse, color);
421
splashOut->startDoc(doc->getXRef());
423
// One image file per page, named <base><3-digit page>.<extension>.
for (int pg = firstPage; pg <= lastPage; ++pg) {
424
doc->displayPage(splashOut, pg,
425
72 * scale, 72 * scale,
426
0, gTrue, gFalse, gFalse);
427
SplashBitmap *bitmap = splashOut->getBitmap();
429
imgFileName = GooString::format("{0:s}{1:03d}.{2:s}",
430
htmlFileName->getCString(), pg, extension);
432
bitmap->writeImgFile(format, imgFileName->getCString(),
433
72 * scale, 72 * scale);
441
// Ghostscript path: page size as reported by the HTML device, divided by
// the zoom factor, is used as the PostScript paper size.
int h=xoutRound(htmlOut->getPageHeight()/scale);
442
int w=xoutRound(htmlOut->getPageWidth()/scale);
443
//int h=xoutRound(doc->getPageHeight(1)/scale);
444
//int w=xoutRound(doc->getPageWidth(1)/scale);
446
// Intermediate PostScript file: <base>.ps, removed again further down.
psFileName = new GooString(htmlFileName->getCString());
447
psFileName->append(".ps");
449
psOut = new PSOutputDev(psFileName->getCString(), doc, doc->getXRef(),
450
doc->getCatalog(), NULL, firstPage, lastPage, psModePS, w, h);
451
psOut->setDisplayText(gFalse);
452
doc->displayPages(psOut, firstPage, lastPage, 72, 72, 0,
453
gTrue, gFalse, gFalse);
456
/*sprintf(buf, "%s -sDEVICE=png16m -dBATCH -dNOPROMPT -dNOPAUSE -r%d -sOutputFile=%s%%03d.png -g%dx%d -q %s", GHOSTSCRIPT, resolution, htmlFileName->getCString(), w, h,
457
psFileName->getCString());*/
459
// Assemble the Ghostscript command line piece by piece.
// NOTE(review): htmlFileName and psFileName are spliced into the command
// string; only partial quoting is visible here. If executeCommand() goes
// through a shell, names containing quotes or metacharacters need
// escaping -- confirm.
GooString *gsCmd = new GooString(GHOSTSCRIPT);
460
GooString *tw, *th, *sc;
461
gsCmd->append(" -sDEVICE=");
462
gsCmd->append(gsDevice);
463
gsCmd->append(" -dBATCH -dNOPROMPT -dNOPAUSE -r");
464
// Resolution: 72 * scale, truncated to an integer.
sc = GooString::fromInt(static_cast<int>(72*scale));
466
gsCmd->append(" -sOutputFile=");
468
gsCmd->append(htmlFileName);
469
gsCmd->append("%03d.");
470
gsCmd->append(extension);
471
gsCmd->append("\" -g");
472
// Output geometry in pixels: scale * w by scale * h.
tw = GooString::fromInt(static_cast<int>(scale*w));
475
// NOTE(review): th is assigned twice in a row; the first allocation is
// overwritten without being freed or appended -- looks like a duplicated
// line / leak; confirm.
th = GooString::fromInt(static_cast<int>(scale*h));
476
th = GooString::fromInt(static_cast<int>(scale*h));
478
gsCmd->append(" -q \"");
479
gsCmd->append(psFileName);
481
// printf("running: %s\n", gsCmd->getCString());
482
// The failure message is suppressed when -q was given.
if( !executeCommand(gsCmd->getCString()) && !errQuiet) {
483
error(-1, "Failed to launch Ghostscript!\n");
485
// Remove the intermediate PostScript file.
unlink(psFileName->getCString());
500
// Cleanup.
if(globalParams) delete globalParams;
502
if(htmlFileName) delete htmlFileName;
505
// check for memory leaks
506
Object::memCheck(stderr);
512
// Looks up `key` in the document info dictionary and, when the entry is a
// string, copies it into a newly allocated GooString (s1 stays NULL
// otherwise). The rest of the function, including the return statement,
// is not visible here -- presumably it returns s1, and the caller owns
// the result; confirm.
static GooString* getInfoString(Dict *infoDict, char *key) {
514
GooString *s1 = NULL;
516
if (infoDict->lookup(key, &obj)->isString()) {
517
s1 = new GooString(obj.getString());
523
// Looks up `key` in the document info dictionary and, when the entry is a
// string, tries to parse it with parseDateString() and reformat it as
// "YYYY-MM-DDTHH:MM:SS+00:00". `result` starts as NULL and is set to a
// newly allocated GooString holding either the formatted date or the raw
// string `s`. The return statement is not visible here -- presumably it
// returns `result`; confirm.
static GooString* getInfoDate(Dict *infoDict, char *key) {
526
int year, mon, day, hour, min, sec, tz_hour, tz_minute;
529
GooString *result = NULL;
532
if (infoDict->lookup(key, &obj)->isString()) {
533
s = obj.getString()->getCString();
534
// TODO do something with the timezone info
535
if ( parseDateString( s, &year, &mon, &day, &hour, &min, &sec, &tz, &tz_hour, &tz_minute ) ) {
536
// Convert to struct tm conventions: years since 1900, months from 0.
tmStruct.tm_year = year - 1900;
537
tmStruct.tm_mon = mon - 1;
538
tmStruct.tm_mday = day;
539
tmStruct.tm_hour = hour;
540
tmStruct.tm_min = min;
541
tmStruct.tm_sec = sec;
542
// Placeholder values; mktime() below fills in the real ones.
tmStruct.tm_wday = -1;
543
tmStruct.tm_yday = -1;
544
// -1 asks mktime() to determine whether DST is in effect.
tmStruct.tm_isdst = -1;
545
mktime(&tmStruct); // compute the tm_wday and tm_yday fields
546
// NOTE(review): the "+00:00" offset is hard-coded in the format string;
// tz, tz_hour and tz_minute are parsed above but not used in the visible
// code (see the TODO).
if (strftime(buf, sizeof(buf), "%Y-%m-%dT%H:%M:%S+00:00", &tmStruct)) {
547
result = new GooString(buf);
549
// Presumably the fallback when strftime() produced nothing: keep the raw
// string (the enclosing else is not visible here).
result = new GooString(s);
552
// Presumably the fallback when the date could not be parsed: keep the
// raw string (the enclosing else is not visible here).
result = new GooString(s);