48
50
#include "scanners.h"
54
#include "bytecode_api.h"
54
58
*Save the file being worked on in tmp */
57
/* Forward declaration only: no definition or call site of try_flatedecode()
 * is visible in this portion of the file.
 * NOTE(review): presumably inflates `buf` into descriptor `fout`, choosing
 * between `real_len` and `calculated_len` - confirm against the definition. */
static int try_flatedecode(unsigned char *buf, off_t real_len, off_t calculated_len, int fout, cli_ctx *ctx);
58
/* Forward declaration only: no definition or call site of flatedecode() is
 * visible in this portion of the file.
 * NOTE(review): presumably inflates `len` bytes of `buf` into descriptor
 * `fout` - confirm against the definition. */
static int flatedecode(unsigned char *buf, off_t len, int fout, cli_ctx *ctx);
61
/* ASCIIHexDecode filter helper (definition not visible here).
 * Decodes from `buf` into the caller-provided `output`; pdf_extract_obj
 * allocates length/2 + 1 bytes for `output` before calling it.
 * The caller stores the result as the decoded size and treats a negative
 * return value as a decode failure. */
static int asciihexdecode(const char *buf, off_t len, char *output);
59
62
/* ASCII85Decode filter helper (definition not visible here).
 * Decodes from `buf` into the caller-provided `output`; pdf_extract_obj
 * allocates length*5 bytes for `output` before calling it.
 * The caller stores the result as the decoded size and treats a negative
 * return value as a decode failure. */
static int ascii85decode(const char *buf, off_t len, unsigned char *output);
60
63
/* Line-skipping helper (definition not visible here).
 * filter_flatedecode calls it to "skip till EOL" and retry inflating from
 * the returned position when the first inflate attempt produces no output.
 * NOTE(review): whether it can return NULL is not shown here - confirm. */
static const char *pdf_nextlinestart(const char *ptr, size_t len);
61
64
/* Token-skipping helper (definition not visible here).
 * Given a position `ptr` and the number of bytes remaining `len`, returns a
 * pointer used by callers as the start of the next PDF token/object.
 * Callers in this file test the result against NULL before dereferencing,
 * so a NULL return must be expected. */
static const char *pdf_nextobject(const char *ptr, size_t len);
67
static int xrefCheck(const char *xref, const char *eof)
70
while (xref < eof && (*xref == ' ' || *xref == '\n' || *xref == '\r'))
74
if (!memcmp(xref, "xref", 4)) {
75
cli_dbgmsg("cli_pdf: found xref\n");
78
/* could be xref stream */
79
for (q=xref; q+5 < eof; q++) {
80
if (!memcmp(q,"/XRef",4)) {
81
cli_dbgmsg("cli_pdf: found /XRef\n");
101
static const char *findNextNonWSBack(const char *q, const char *start)
104
(*q == 0 || *q == 9 || *q == 0xa || *q == 0xc || *q == 0xd || *q == 0x20))
111
static int find_stream_bounds(const char *start, off_t bytesleft, off_t bytesleft2, off_t *stream, off_t *endstream)
114
if ((q2 = cli_memstr(start, bytesleft, "stream", 6))) {
116
if (q2[0] == '\xd' && q2[1] == '\xa')
120
*stream = q2 - start;
121
bytesleft2 -= q2 - start;
125
q2 = cli_memstr(q, bytesleft2, "endstream", 9);
127
q2 = q + bytesleft2-9; /* till EOF */
128
*endstream = q2 - start;
129
if (*endstream < *stream)
130
*endstream = *stream;
136
static int pdf_findobj(struct pdf_struct *pdf)
138
const char *start, *q, *q2, *q3, *eof;
141
unsigned genid, objid;
144
pdf->objs = cli_realloc2(pdf->objs, sizeof(*pdf->objs)*pdf->nobjs);
146
cli_warnmsg("cli_pdf: out of memory parsing objects (%u)\n", pdf->nobjs);
149
obj = &pdf->objs[pdf->nobjs-1];
150
memset(obj, 0, sizeof(*obj));
151
start = pdf->map+pdf->offset;
152
bytesleft = pdf->size - pdf->offset;
153
while (bytesleft > 0) {
154
q2 = cli_memstr(start, bytesleft, "obj", 3);
156
return 0;/* no more objs */
158
bytesleft -= q2 - start;
159
if (*q2 != 0 && *q2 != 9 && *q2 != 0xa && *q2 != 0xc && *q2 != 0xd && *q2 != 0x20) {
169
q = findNextNonWSBack(q2-1, start);
170
while (q > start && isdigit(*q)) { q--; }
172
q = findNextNonWSBack(q-1,start);
173
while (q > start && isdigit(*q)) { q--; }
175
obj->id = (objid << 8) | (genid&0xff);
176
obj->start = q2+4 - pdf->map;
179
eof = pdf->map + pdf->size;
180
q = pdf->map + obj->start;
181
while (q < eof && bytesleft > 0) {
182
off_t p_stream, p_endstream;
183
q2 = pdf_nextobject(q, bytesleft);
185
q2 = pdf->map + pdf->size;
187
if (find_stream_bounds(q-1, q2-q, bytesleft + (q2-q), &p_stream, &p_endstream)) {
188
obj->flags |= 1 << OBJ_STREAM;
189
q2 = q-1 + p_endstream + 9;
190
bytesleft -= q2 - q + 1;
192
obj->flags |= 1 << OBJ_TRUNCATED;
193
pdf->offset = pdf->size;
194
return 1;/* truncated */
196
} else if ((q3 = cli_memstr(q-1, q2-q+1, "endobj", 6))) {
198
pdf->offset = q2 - pdf->map;
199
return 1; /* obj found and offset positioned */
206
obj->flags |= 1 << OBJ_TRUNCATED;
207
pdf->offset = pdf->size;
208
return 1;/* truncated */
211
static int filter_writen(struct pdf_struct *pdf, struct pdf_obj *obj,
212
int fout, const char *buf, off_t len, off_t *sum)
214
if (cli_checklimits("pdf", pdf->ctx, *sum, 0, 0))
215
return len; /* pretend it was a successful write to suppress CL_EWRITE */
217
return cli_writen(fout, buf, len);
220
static void pdfobj_flag(struct pdf_struct *pdf, struct pdf_obj *obj, enum pdf_flag flag)
223
pdf->flags |= 1 << flag;
227
case UNTERMINATED_OBJ_DICT:
228
s = "dictionary not terminated";
230
case ESCAPED_COMMON_PDFNAME:
231
/* like /JavaScript */
232
s = "escaped common pdfname";
234
case BAD_STREAM_FILTERS:
235
s = "duplicate stream filters";
237
case BAD_PDF_VERSION:
238
s = "bad pdf version";
240
case BAD_PDF_HEADERPOS:
241
s = "bad pdf header position";
243
case BAD_PDF_TRAILER:
244
s = "bad pdf trailer";
246
case BAD_PDF_TOOMANYOBJS:
247
s = "too many pdf objs";
250
s = "bad deflate stream";
253
s = "bad deflate stream start";
255
case BAD_STREAMSTART:
256
s = "bad stream start";
259
s = "unknown filter used";
261
case BAD_ASCIIDECODE:
262
s = "bad ASCII decode";
265
s = "hex javascript";
268
s = "referencing nonexistent obj";
271
s = "has /OpenAction";
274
s = "bad /Length, too small";
277
s = "PDF is encrypted";
280
s = "linearized PDF";
283
s = "more than 2 filters per obj";
286
cli_dbgmsg("cli_pdf: %s flagged in object %u %u\n", s, obj->id>>8, obj->id&0xff);
289
static int filter_flatedecode(struct pdf_struct *pdf, struct pdf_obj *obj,
290
const char *buf, off_t len, int fout, off_t *sum)
304
pdfobj_flag(pdf, obj, BAD_STREAMSTART);
305
/* PDF spec says stream is followed by \r\n or \n, but not \r alone.
306
* Sample 0015315109, it has \r followed by zlib header.
307
* Flag pdf as suspicious, and attempt to extract by skipping the \r.
313
memset(&stream, 0, sizeof(stream));
314
stream.next_in = (Bytef *)buf;
315
stream.avail_in = len;
316
stream.next_out = (Bytef *)output;
317
stream.avail_out = sizeof(output);
319
zstat = inflateInit(&stream);
321
cli_warnmsg("cli_pdf: inflateInit failed\n");
326
while(stream.avail_in) {
328
zstat = inflate(&stream, Z_NO_FLUSH); /* zlib */
331
if(stream.avail_out == 0) {
332
if ((written=filter_writen(pdf, obj, fout, output, sizeof(output), sum))!=sizeof(output)) {
333
cli_errmsg("cli_pdf: failed to write output file\n");
338
stream.next_out = (Bytef *)output;
339
stream.avail_out = sizeof(output);
344
written = sizeof(output) - stream.avail_out;
345
if (!written && !nbytes && !skipped) {
346
/* skip till EOL, and try inflating from there, sometimes
347
* PDFs contain extra whitespace */
348
const char *q = pdf_nextlinestart(buf, len);
354
stream.next_in = (Bytef *)buf;
355
stream.avail_in = len;
356
stream.next_out = (Bytef *)output;
357
stream.avail_out = sizeof(output);
358
zstat = inflateInit(&stream);
360
cli_warnmsg("cli_pdf: inflateInit failed\n");
363
pdfobj_flag(pdf, obj, BAD_FLATESTART);
368
if (filter_writen(pdf, obj, fout, output, written, sum)!=written) {
369
cli_errmsg("cli_pdf: failed to write output file\n");
374
stream.next_out = (Bytef *)output;
375
stream.avail_out = sizeof(output);
376
if (zstat == Z_STREAM_END)
380
cli_dbgmsg("cli_pdf: after writing %lu bytes, got error \"%s\" inflating PDF stream in %u %u obj\n",
381
(unsigned long)nbytes,
382
stream.msg, obj->id>>8, obj->id&0xff);
384
cli_dbgmsg("cli_pdf: after writing %lu bytes, got error %d inflating PDF stream in %u %u obj\n",
385
(unsigned long)nbytes, zstat, obj->id>>8, obj->id&0xff);
386
/* mark stream as bad only if not encrypted */
389
cli_dbgmsg("cli_pdf: dumping raw stream (probably encrypted)\n");
390
if (filter_writen(pdf, obj, fout, buf, len, sum) != len) {
391
cli_errmsg("cli_pdf: failed to write output file\n");
394
pdfobj_flag(pdf, obj, BAD_FLATESTART);
396
pdfobj_flag(pdf, obj, BAD_FLATE);
403
if(stream.avail_out != sizeof(output)) {
404
if(filter_writen(pdf, obj, fout, output, sizeof(output) - stream.avail_out, sum) < 0) {
405
cli_errmsg("cli_pdf: failed to write output file\n");
415
static struct pdf_obj *find_obj(struct pdf_struct *pdf,
416
struct pdf_obj *obj, uint32_t objid)
421
/* search starting at previous obj (if exists) */
422
if (obj != pdf->objs)
426
for (j=i;j<pdf->nobjs;j++) {
428
if (obj->id == objid)
431
/* restart search from beginning if not found */
434
if (obj->id == objid)
440
static int find_length(struct pdf_struct *pdf,
442
const char *start, off_t len)
446
q = cli_memstr(start, len, "/Length", 7);
451
start = pdf_nextobject(q, len);
454
/* len -= start - q; */
457
while (isdigit(*q)) q++;
462
while(isdigit(*q)) q++;
463
if (q[0] == ' ' && q[1] == 'R') {
464
cli_dbgmsg("cli_pdf: length is in indirect object %u %u\n", length, genid);
465
obj = find_obj(pdf, obj, (length << 8) | (genid&0xff));
467
cli_dbgmsg("cli_pdf: indirect object not found\n");
470
q = pdf_nextobject(pdf->map+obj->start, pdf->size - obj->start);
475
if (start - pdf->map + length+5 > pdf->size) {
476
length = pdf->size - (start - pdf->map)-5;
481
/* Object flag bits tested by pdf_extract_obj() when it decides which
 * objects are not dumped: decodable/extractable filters, embedded files,
 * JavaScript and /OpenAction targets. */
#define DUMP_MASK                   \
    ((1 << OBJ_FILTER_FLATE) |      \
     (1 << OBJ_FILTER_DCT) |        \
     (1 << OBJ_FILTER_AH) |         \
     (1 << OBJ_FILTER_A85) |        \
     (1 << OBJ_EMBEDDED_FILE) |     \
     (1 << OBJ_JAVASCRIPT) |        \
     (1 << OBJ_OPENACTION))
483
static int obj_size(struct pdf_struct *pdf, struct pdf_obj *obj, int binary)
485
unsigned i = obj - pdf->objs;
487
if (i < pdf->nobjs) {
488
int s = pdf->objs[i].start - obj->start - 4;
491
const char *p = pdf->map + obj->start;
492
const char *q = p + s;
493
while (q > p && (isspace(*q) || isdigit(*q)))
495
if (q > p+5 && !memcmp(q-5,"endobj",6))
497
q = findNextNonWSBack(q, p);
505
return pdf->size - obj->start;
506
return pdf->offset - obj->start - 6;
509
static int run_pdf_hooks(struct pdf_struct *pdf, enum pdf_phase phase, int fd,
513
struct cli_bc_ctx *bc_ctx;
514
cli_ctx *ctx = pdf->ctx;
517
bc_ctx = cli_bytecode_context_alloc();
519
cli_errmsg("cli_pdf: can't allocate memory for bc_ctx");
525
map = fmap(fd, 0, 0);
527
cli_warnmsg("can't mmap pdf extracted obj\n");
532
cli_bytecode_context_setpdf(bc_ctx, phase, pdf->nobjs, pdf->objs,
533
&pdf->flags, pdf->size, pdf->startoff);
534
cli_bytecode_context_setctx(bc_ctx, ctx);
535
ret = cli_bytecode_runhook(ctx, ctx->engine, bc_ctx, BC_PDF, map, ctx->virname);
536
cli_bytecode_context_destroy(bc_ctx);
543
static int pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj)
545
char fullname[NAME_MAX + 1];
549
char *ascii_decoded = NULL;
552
/* TODO: call bytecode hook here, allow override dumpability */
553
if ((!(obj->flags & (1 << OBJ_STREAM)) ||
554
(obj->flags & (1 << OBJ_HASFILTERS)))
555
&& !(obj->flags & DUMP_MASK)) {
556
/* don't dump all streams */
559
if ((obj->flags & (1 << OBJ_IMAGE)) &&
560
!(obj->flags & (1 << OBJ_FILTER_DCT))) {
561
/* don't dump / scan non-JPG images */
564
if (obj->flags & (1 << OBJ_FORCEDUMP)) {
565
/* bytecode can force dump by setting this flag */
570
cli_dbgmsg("cli_pdf: dumping obj %u %u\n", obj->id>>8, obj->id);
571
snprintf(fullname, sizeof(fullname), "%s"PATHSEP"pdf%02u", pdf->dir, pdf->files++);
572
fout = open(fullname,O_RDWR|O_CREAT|O_EXCL|O_TRUNC|O_BINARY, 0600);
575
cli_errmsg("cli_pdf: can't create temporary file %s: %s\n", fullname, cli_strerror(errno, err, sizeof(err)));
581
if (obj->flags & (1 << OBJ_STREAM)) {
582
const char *start = pdf->map + obj->start;
583
off_t p_stream = 0, p_endstream = 0;
585
find_stream_bounds(start, pdf->size - obj->start,
586
pdf->size - obj->start,
587
&p_stream, &p_endstream);
588
if (p_stream && p_endstream) {
589
const char *flate_in;
590
long ascii_decoded_size = 0;
591
size_t size = p_endstream - p_stream;
594
length = find_length(pdf, obj, start, p_stream);
597
orig_length = length;
598
if (length > pdf->size || obj->start + p_stream + length > pdf->size) {
599
cli_dbgmsg("cli_pdf: length out of file: %ld + %ld > %ld\n",
600
p_stream, length, pdf->size);
601
length = pdf->size - (obj->start + p_stream);
603
if (!(obj->flags & (1 << OBJ_FILTER_FLATE)) && length <= 0) {
604
const char *q = start + p_endstream;
612
} else if (*q == '\r') {
617
cli_dbgmsg("cli_pdf: calculated length %ld\n", length);
619
if (size > length+2) {
620
cli_dbgmsg("cli_pdf: calculated length %ld < %ld\n",
625
if (orig_length && size > orig_length + 20) {
626
cli_dbgmsg("cli_pdf: orig length: %ld, length: %ld, size: %ld\n",
627
orig_length, length, size);
628
pdfobj_flag(pdf, obj, BAD_STREAMLEN);
633
if (obj->flags & (1 << OBJ_FILTER_AH)) {
634
ascii_decoded = cli_malloc(length/2 + 1);
635
if (!ascii_decoded) {
636
cli_errmsg("Cannot allocate memory for asciidecode\n");
640
ascii_decoded_size = asciihexdecode(start + p_stream,
643
} else if (obj->flags & (1 << OBJ_FILTER_A85)) {
644
ascii_decoded = cli_malloc(length*5);
645
if (!ascii_decoded) {
646
cli_errmsg("Cannot allocate memory for asciidecode\n");
650
ascii_decoded_size = ascii85decode(start+p_stream,
652
(unsigned char*)ascii_decoded);
654
if (ascii_decoded_size < 0) {
655
/* don't flag for images or truncated objs*/
657
((1 << OBJ_IMAGE) | (1 << OBJ_TRUNCATED))))
658
pdfobj_flag(pdf, obj, BAD_ASCIIDECODE);
659
cli_dbgmsg("cli_pdf: failed to asciidecode in %u %u obj\n", obj->id>>8,obj->id&0xff);
661
ascii_decoded = NULL;
662
/* attempt to directly flatedecode it */
664
/* either direct or ascii-decoded input */
666
ascii_decoded_size = length;
667
flate_in = ascii_decoded ? ascii_decoded : start+p_stream;
669
if (obj->flags & (1 << OBJ_FILTER_FLATE)) {
670
cli_dbgmsg("cli_pdf: deflate len %ld (orig %ld)\n", ascii_decoded_size, (long)orig_length);
671
rc = filter_flatedecode(pdf, obj, flate_in, ascii_decoded_size, fout, &sum);
673
if (filter_writen(pdf, obj, fout, flate_in, ascii_decoded_size, &sum) != ascii_decoded_size)
677
} else if (obj->flags & (1 << OBJ_JAVASCRIPT)) {
679
const char *q = pdf->map+obj->start;
680
/* TODO: get obj-endobj size */
681
off_t bytesleft = obj_size(pdf, obj, 0);
685
q2 = cli_memstr(q, bytesleft, "/JavaScript", 11);
692
q = pdf_nextobject(q2, bytesleft);
701
if (filter_writen(pdf, obj, fout, q+1, bytesleft-1, &sum) != (bytesleft-1)) {
705
} else if (*q == '<') {
707
q2 = memchr(q+1, '>', bytesleft);
708
if (!q2) q2 = q + bytesleft;
709
decoded = cli_malloc(q2 - q);
714
cli_hex2str_to(q2, decoded, q2-q-1);
715
decoded[q2-q-1] = '\0';
716
cli_dbgmsg("cli_pdf: found hexadecimal encoded javascript in %u %u obj\n",
717
obj->id>>8, obj->id&0xff);
718
pdfobj_flag(pdf, obj, HEX_JAVASCRIPT);
719
filter_writen(pdf, obj, fout, decoded, q2-q-1, &sum);
723
off_t bytesleft = obj_size(pdf, obj, 0);
724
if (filter_writen(pdf, obj, fout , pdf->map + obj->start, bytesleft,&sum) != bytesleft)
728
cli_dbgmsg("cli_pdf: extracted %ld bytes %u %u obj to %s\n", sum, obj->id>>8, obj->id&0xff, fullname);
731
cli_updatelimits(pdf->ctx, sum);
732
/* TODO: invoke bytecode on this pdf obj with metainformation associated
734
lseek(fout, 0, SEEK_SET);
735
rc2 = cli_magic_scandesc(fout, pdf->ctx);
736
if (rc2 == CL_VIRUS || rc == CL_SUCCESS)
738
if (rc == CL_CLEAN) {
739
rc2 = run_pdf_hooks(pdf, PDF_PHASE_POSTDUMP, fout, obj - pdf->objs);
746
if (!pdf->ctx->engine->keeptmp)
747
if (cli_unlink(fullname) && rc != CL_VIRUS)
759
STATE_ANY /* for actions table below */
762
struct pdfname_action {
764
enum pdf_objflags set_objflag;/* OBJ_DICT is noop */
765
enum objstate from_state;/* STATE_NONE is noop */
766
enum objstate to_state;
769
static struct pdfname_action pdfname_actions[] = {
770
{"ASCIIHexDecode", OBJ_FILTER_AH, STATE_FILTER, STATE_FILTER},
771
{"ASCII85Decode", OBJ_FILTER_A85, STATE_FILTER, STATE_FILTER},
772
{"A85", OBJ_FILTER_A85, STATE_FILTER, STATE_FILTER},
773
{"AHx", OBJ_FILTER_AH, STATE_FILTER, STATE_FILTER},
774
{"EmbeddedFile", OBJ_EMBEDDED_FILE, STATE_NONE, STATE_NONE},
775
{"FlateDecode", OBJ_FILTER_FLATE, STATE_FILTER, STATE_FILTER},
776
{"Fl", OBJ_FILTER_FLATE, STATE_FILTER, STATE_FILTER},
777
{"Image", OBJ_IMAGE, STATE_NONE, STATE_NONE},
778
{"LZWDecode", OBJ_FILTER_LZW, STATE_FILTER, STATE_FILTER},
779
{"LZW", OBJ_FILTER_LZW, STATE_FILTER, STATE_FILTER},
780
{"RunLengthDecode", OBJ_FILTER_RL, STATE_FILTER, STATE_FILTER},
781
{"RL", OBJ_FILTER_RL, STATE_FILTER, STATE_FILTER},
782
{"CCITTFaxDecode", OBJ_FILTER_FAX, STATE_FILTER, STATE_FILTER},
783
{"CCF", OBJ_FILTER_FAX, STATE_FILTER, STATE_FILTER},
784
{"JBIG2Decode", OBJ_FILTER_DCT, STATE_FILTER, STATE_FILTER},
785
{"DCTDecode", OBJ_FILTER_DCT, STATE_FILTER, STATE_FILTER},
786
{"DCT", OBJ_FILTER_DCT, STATE_FILTER, STATE_FILTER},
787
{"JPXDecode", OBJ_FILTER_JPX, STATE_FILTER, STATE_FILTER},
788
{"Crypt", OBJ_FILTER_CRYPT, STATE_FILTER, STATE_NONE},
789
{"Standard", OBJ_FILTER_CRYPT, STATE_FILTER, STATE_FILTER},
790
{"Sig", OBJ_SIGNED, STATE_ANY, STATE_NONE},
791
{"V", OBJ_SIGNED, STATE_ANY, STATE_NONE},
792
{"R", OBJ_SIGNED, STATE_ANY, STATE_NONE},
793
{"Linearized", OBJ_DICT, STATE_NONE, STATE_LINEARIZED},
794
{"Filter", OBJ_HASFILTERS, STATE_ANY, STATE_FILTER},
795
{"JavaScript", OBJ_JAVASCRIPT, STATE_S, STATE_JAVASCRIPT},
796
{"Length", OBJ_DICT, STATE_FILTER, STATE_NONE},
797
{"S", OBJ_DICT, STATE_NONE, STATE_S},
798
{"Type", OBJ_DICT, STATE_NONE, STATE_NONE},
799
{"OpenAction", OBJ_OPENACTION, STATE_ANY, STATE_OPENACTION}
802
/* Flag bits of every stream filter this parser recognises by name.
 * handle_pdfname() and pdf_parseobj() test obj->flags against this mask
 * when setting or clearing OBJ_FILTER_UNKNOWN. */
#define KNOWN_FILTERS               \
    ((1 << OBJ_FILTER_AH) |         \
     (1 << OBJ_FILTER_RL) |         \
     (1 << OBJ_FILTER_A85) |        \
     (1 << OBJ_FILTER_FLATE) |      \
     (1 << OBJ_FILTER_LZW) |        \
     (1 << OBJ_FILTER_FAX) |        \
     (1 << OBJ_FILTER_DCT) |        \
     (1 << OBJ_FILTER_JPX) |        \
     (1 << OBJ_FILTER_CRYPT))
804
static void handle_pdfname(struct pdf_struct *pdf, struct pdf_obj *obj,
805
const char *pdfname, int escapes,
806
enum objstate *state)
808
struct pdfname_action *act = NULL;
810
for (j=0;j<sizeof(pdfname_actions)/sizeof(pdfname_actions[0]);j++) {
811
if (!strcmp(pdfname, pdfname_actions[j].pdfname)) {
812
act = &pdfname_actions[j];
817
if (*state == STATE_FILTER &&
818
!(obj->flags & (1 << OBJ_SIGNED)) &&
819
/* these are digital signature objects, filter doesn't matter,
820
* we don't need them anyway */
821
!(obj->flags & KNOWN_FILTERS)) {
822
cli_dbgmsg("cli_pdf: unknown filter %s\n", pdfname);
823
obj->flags |= 1 << OBJ_FILTER_UNKNOWN;
828
/* if a commonly used PDF name is escaped that is certainly
830
cli_dbgmsg("cli_pdf: pdfname %s is escaped\n", pdfname);
831
pdfobj_flag(pdf, obj, ESCAPED_COMMON_PDFNAME);
833
if (act->from_state == *state ||
834
act->from_state == STATE_ANY) {
835
*state = act->to_state;
837
if (*state == STATE_FILTER &&
838
act->set_objflag !=OBJ_DICT &&
839
(obj->flags & (1 << act->set_objflag))) {
840
cli_dbgmsg("cli_pdf: duplicate stream filter %s\n", pdfname);
841
pdfobj_flag(pdf, obj, BAD_STREAM_FILTERS);
843
obj->flags |= 1 << act->set_objflag;
845
/* auto-reset states */
856
static void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj)
858
/* enough to hold common pdf names, we don't need all the names */
861
const char *q = obj->start + pdf->map;
862
const char *dict, *start;
864
off_t bytesleft = obj_size(pdf, obj, 1);
865
unsigned i, filters=0;
866
enum objstate objstate = STATE_NONE;
871
/* find start of dictionary */
873
q2 = pdf_nextobject(q, bytesleft);
875
if (!q2 || bytesleft < 0) {
878
q3 = memchr(q-1, '<', q2-q+1);
882
} while (!q3 || q3[1] != '<');
885
bytesleft = obj_size(pdf, obj, 1) - (q - start);
886
/* find end of dictionary */
888
q2 = pdf_nextobject(q, bytesleft);
890
if (!q2 || bytesleft < 0) {
893
q3 = memchr(q-1, '>', q2-q+1);
897
} while (!q3 || q3[1] != '>');
898
obj->flags |= 1 << OBJ_DICT;
899
dict_length = q3 - dict;
901
/* process pdf names */
902
for (q = dict;dict_length;) {
904
q2 = memchr(q, '/', dict_length);
907
dict_length -= q2 - q;
909
/* normalize PDF names */
910
for (i = 0;dict_length && (i < sizeof(pdfname)-1); i++) {
914
cli_hex2str_to(q+1, pdfname+i, 2);
920
if (*q == ' ' || *q == '\t' || *q == '\r' || *q == '\n' ||
921
*q == '/' || *q == '>' || *q == ']' || *q == '[' || *q == '<')
927
handle_pdfname(pdf, obj, pdfname, escapes, &objstate);
928
if (objstate == STATE_LINEARIZED) {
929
pdfobj_flag(pdf, obj, LINEARIZED_PDF);
930
objstate = STATE_NONE;
932
if (objstate == STATE_JAVASCRIPT ||
933
objstate == STATE_OPENACTION) {
934
if (objstate == STATE_OPENACTION)
935
pdfobj_flag(pdf, obj, HAS_OPENACTION);
936
q2 = pdf_nextobject(q, dict_length);
937
if (q2 && isdigit(*q2)) {
938
uint32_t objid = atoi(q2) << 8;
939
while (isdigit(*q2)) q2++;
940
q2 = pdf_nextobject(q2, dict_length);
941
if (q2 && isdigit(*q2)) {
942
objid |= atoi(q2) & 0xff;
943
q2 = pdf_nextobject(q2, dict_length);
945
struct pdf_obj *obj2;
946
cli_dbgmsg("cli_pdf: found %s stored in indirect object %u %u\n",
948
objid >> 8, objid&0xff);
949
obj2 = find_obj(pdf, obj, objid);
951
enum pdf_objflags flag = objstate == STATE_JAVASCRIPT ?
952
OBJ_JAVASCRIPT : OBJ_OPENACTION;
953
obj2->flags |= 1 << flag;
954
obj->flags &= ~(1 << flag);
956
pdfobj_flag(pdf, obj, BAD_INDOBJ);
961
objstate = STATE_NONE;
964
for (i=0;i<sizeof(pdfname_actions)/sizeof(pdfname_actions[0]);i++) {
965
const struct pdfname_action *act = &pdfname_actions[i];
966
if ((obj->flags & (1 << act->set_objflag)) &&
967
act->from_state == STATE_FILTER &&
968
act->to_state == STATE_FILTER &&
969
act->set_objflag != OBJ_FILTER_CRYPT) {
973
if (filters > 2) { /* more than 2 non-crypt filters */
974
pdfobj_flag(pdf, obj, MANY_FILTERS);
976
if (obj->flags & ((1 << OBJ_SIGNED) | KNOWN_FILTERS))
977
obj->flags &= ~(1 << OBJ_FILTER_UNKNOWN);
978
if (obj->flags & (1 << OBJ_FILTER_UNKNOWN))
979
pdfobj_flag(pdf, obj, UNKNOWN_FILTER);
980
cli_dbgmsg("cli_pdf: %u %u obj flags: %02x\n", obj->id>>8, obj->id&0xff, obj->flags);
983
int cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
985
struct pdf_struct pdf;
986
fmap_t *map = *ctx->fmap;
987
size_t size = map->len - offset;
988
off_t versize = size > 1032 ? 1032 : size;
989
off_t map_off, bytesleft;
991
const char *pdfver, *start, *eofmap, *q, *eof;
995
cli_dbgmsg("in cli_pdf(%s)\n", dir);
996
memset(&pdf, 0, sizeof(pdf));
1000
pdfver = start = fmap_need_off_once(map, offset, versize);
1002
/* Check PDF version */
1004
cli_errmsg("cli_pdf: mmap() failed (1)\n");
1007
/* offset is 0 when coming from filetype2 */
1008
pdfver = cli_memstr(pdfver, versize, "%PDF-", 5);
1010
cli_dbgmsg("cli_pdf: no PDF- header found\n");
1013
/* Check for PDF-1.[0-9]. Although 1.7 is highest now, allow for future
1015
if (pdfver[5] != '1' || pdfver[6] != '.' ||
1016
pdfver[7] < '1' || pdfver[7] > '9') {
1017
pdf.flags |= 1 << BAD_PDF_VERSION;
1018
cli_dbgmsg("cli_pdf: bad pdf version: %.8s\n", pdfver);
1020
if (pdfver != start || offset) {
1021
pdf.flags |= 1 << BAD_PDF_HEADERPOS;
1022
cli_dbgmsg("cli_pdf: PDF header is not at position 0: %ld\n",pdfver-start+offset);
1024
offset += pdfver - start;
1026
/* find trailer and xref, don't fail if not found */
1027
map_off = map->len - 2048;
1030
bytesleft = map->len - map_off;
1031
eofmap = fmap_need_off_once(map, map_off, bytesleft);
1033
cli_errmsg("cli_pdf: mmap() failed (2)\n");
1036
eof = eofmap + bytesleft;
1037
for (q=&eofmap[bytesleft-5]; q > eofmap; q--) {
1038
if (memcmp(q, "%%EOF", 5) == 0)
1042
pdf.flags |= 1 << BAD_PDF_TRAILER;
1043
cli_dbgmsg("cli_pdf: %%%%EOF not found\n");
1046
size = q - eofmap + map_off;
1047
for (;q > eofmap;q--) {
1048
if (memcmp(q, "startxref", 9) == 0)
1052
pdf.flags |= 1 << BAD_PDF_TRAILER;
1053
cli_dbgmsg("cli_pdf: startxref not found\n");
1055
for (t=q;t > eofmap; t--) {
1056
if (memcmp(t,"trailer",7) == 0)
1060
if (cli_memstr(t, q-t, "/Encrypt", 8)) {
1061
pdf.flags |= 1 << ENCRYPTED_PDF;
1062
cli_dbgmsg("cli_pdf: encrypted pdf found, stream will probably fail to decompress!\n");
1066
while (q < eof && (*q == ' ' || *q == '\n' || *q == '\r')) { q++; }
1068
bytesleft = map->len - offset - xref;
1069
if (bytesleft > 4096)
1071
q = fmap_need_off_once(map, offset + xref, bytesleft);
1072
if (!q || xrefCheck(q, q+bytesleft) == -1) {
1073
cli_dbgmsg("cli_pdf: did not find valid xref\n");
1074
pdf.flags |= 1 << BAD_PDF_TRAILER;
1081
pdf.map = fmap_need_off(map, offset, size);
1082
pdf.startoff = offset;
1084
cli_errmsg("cli_pdf: mmap() failed (3)\n");
1087
rc = run_pdf_hooks(&pdf, PDF_PHASE_PRE, -1, -1);
1089
cli_dbgmsg("cli_pdf: returning %d\n", rc);
1092
/* parse PDF and find obj offsets */
1093
while ((rc = pdf_findobj(&pdf)) > 0) {
1094
struct pdf_obj *obj = &pdf.objs[pdf.nobjs-1];
1095
cli_dbgmsg("found %d %d obj @%ld\n", obj->id >> 8, obj->id&0xff, obj->start + offset);
1100
pdf.flags |= 1 << BAD_PDF_TOOMANYOBJS;
1102
/* must parse after finding all objs, so we can flag indirect objects */
1103
for (i=0;i<pdf.nobjs;i++) {
1104
struct pdf_obj *obj = &pdf.objs[i];
1105
pdf_parseobj(&pdf, obj);
1108
rc = run_pdf_hooks(&pdf, PDF_PHASE_PARSED, -1, -1);
1109
/* extract PDF objs */
1110
for (i=0;!rc && i<pdf.nobjs;i++) {
1111
struct pdf_obj *obj = &pdf.objs[i];
1112
rc = pdf_extract_obj(&pdf, obj);
1115
if (pdf.flags & (1 << ENCRYPTED_PDF))
1116
pdf.flags &= ~ ((1 << BAD_FLATESTART) | (1 << BAD_STREAMSTART) |
1117
(1 << BAD_ASCIIDECODE));
1119
if (pdf.flags && !rc) {
1120
cli_dbgmsg("cli_pdf: flags 0x%02x\n", pdf.flags);
1121
rc = run_pdf_hooks(&pdf, PDF_PHASE_END, -1, -1);
1122
if (!rc && (ctx->options & CL_SCAN_ALGORITHMIC)) {
1123
if (pdf.flags & (1 << ESCAPED_COMMON_PDFNAME)) {
1124
/* for example /Fl#61te#44#65#63#6f#64#65 instead of /FlateDecode */
1125
*ctx->virname = "Heuristics.PDF.ObfuscatedNameObject";
1126
rc = cli_found_possibly_unwanted(ctx);
1130
/* TODO: find both trailers, and /Encrypt settings */
1131
if (pdf.flags & (1 << LINEARIZED_PDF))
1132
pdf.flags &= ~ (1 << BAD_ASCIIDECODE);
1133
if (pdf.flags & (1 << MANY_FILTERS))
1134
pdf.flags &= ~ (1 << BAD_ASCIIDECODE);
1135
if (!rc && (pdf.flags &
1136
((1 << BAD_PDF_TOOMANYOBJS) | (1 << BAD_STREAM_FILTERS) |
1137
(1<<BAD_FLATE) | (1<<BAD_ASCIIDECODE)|
1138
(1<<UNTERMINATED_OBJ_DICT) | (1<<UNKNOWN_FILTER)))) {
1143
cli_dbgmsg("cli_pdf: returning %d\n", rc);
1149
static int try_flatedecode(unsigned char *buf, off_t real_len, off_t calculated_len, int fout, cli_ctx *ctx);
1150
static int flatedecode(unsigned char *buf, off_t len, int fout, cli_ctx *ctx);
64
1152
cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)