1
% Copyright (C) 2002 Artifex Software, Inc. All rights reserved.
3
% This software is provided AS-IS with no warranty, either express or
6
% This software is distributed under license and may not be copied,
7
% modified or distributed except as expressly authorized under the terms
8
% of the license contained in the file LICENSE in this distribution.
10
% For more information about licensing, please refer to
11
% http://www.ghostscript.com/licensing/. For information on
12
% commercial licensing, go to http://www.artifex.com/licensing/ or
13
% contact Artifex Software, Inc., 101 Lucas Valley Road #110,
14
% San Rafael, CA 94903, U.S.A., +1(415)492-9861.
16
% $Id: pdf_rbld.ps 8472 2008-01-01 14:28:30Z alexcher $
17
% pdf_rbld.ps - Rebuilding of broken PDF files (xref errors)
19
% This module contains routines that are used if we detect an error
20
% while reading the xref tables. These routines will scan the file and
21
% build an xref table by finding the objects. We also need to find the
22
% appropriate trailer dictionary. Note: One procedure is also used
23
% even if we do not need to rebuild a PDF file.
25
% This module cannot rebuild a PDF file which has had errors created inside
26
% of objects or binary data streams. It often succeeds with files that
27
% have had their line endings converted between Unix and DOS conventions.
29
% if true --> we have an object with duplicate object and generation numbers.
30
% Initial value. search_objects resets this flag to false before scanning,
% and the xref-entry code sets it to true when it sees an entry whose object
% and generation numbers match an entry that is already stored.
/dup_obj_gen_num false def
32
% Note: This procedure is also used by non-rebuild code.
33
% Store a line in the xref array (Actually Objects and Generations arrays)
34
% <obj num> <strm num> <obj loc> <gen num> <rebuild>
35
% setxrefentry <obj num> <strm num> <obj loc> <gen num>
39
% Body of setxrefentry; the line that opens the definition is not visible in
% this view. The stack indices used below address, from the top:
% <gen num> <obj loc> <strm num> <obj num>.
% NOTE(review): the final '8 -1 roll' fetches the rebuild flag from beneath
% <obj num>, so the flag is presumably rolled down there by a line that is
% not shown here -- confirm against the full file.
% A generation number with any bit set outside the low 16 bits (which
% includes negative values) fails this test.
dup 65535 or 65535 ne {
40
( **** Warning: Generation number out of 0..65535 range, assuming 0.\n)
44
% We store generation numbers as value + 1
45
% We reserve 0 to indicate a free xref entry
46
1 add % increment generation number
47
% To save space, generation numbers are stored in an lstring unless we
48
% find a generation number greater than 255. If so then transfer to
51
Generations ltype /stringtype eq { % Convert Generations to an larray.
52
larray Generations llength lgrowto dup % Create new larray
53
0 1 2 index llength 1 sub { % Copy from old lstring to new larray
54
Generations 1 index lget lput dup
57
/Generations exch store % Save new Generations larray
60
% Verify that the new values are for a new object. If the current
61
% entry is null then we have a new entry.
62
Objects 4 index lget null eq {
63
ObjectStream 4 index 4 index cvx lput % Save ObjectStream object number
64
Objects 4 index 3 index cvx lput % Save object location
65
Generations 4 index 2 index lput % Save generation number
67
% Verify that the new entry has at least as high a generation number.
68
% We accept an equal generation number because we have found PDF files in
69
% which there are multiple objects with the same object and generation
70
% numbers. The normal xref logic only accepts the first such
71
% entry that it finds. However the 'rebuild PDF' logic can find
72
% both such entries. The correct one is usually the last one.
73
Generations 4 index lget 1 index le {
74
ObjectStream 4 index 4 index cvx lput % Save ObjectStream object number
75
Objects 4 index 3 index cvx lput % Save object location
76
Generations 4 index 2 index lput % Save generation number
78
% Set error flag if we have equal object and generation numbers
% NOTE(review): if this test runs after the store just above, the stored
% value already equals the new one whenever the store happened, so the flag
% would also be set when the old generation was strictly lower -- confirm
% the intended ordering against the full file.
79
Generations 4 index lget 1 index eq { /dup_obj_gen_num true def } if
80
} 8 -1 roll { ifelse } { pop if } ifelse % Run 'else' only when rebuilding.
83
% Print the contents of the xref array. This actually consists of three
84
% arrays (Objects, Generations, and ObjectStream). All three are larrays.
85
% larrays are a special Ghostscript object which can be arrays with more
87
% For every index from 0 to (length of Objects - 1), print the index, the
% stored Generations value minus 1 (generation numbers are stored as
% value + 1), the ObjectStream entry and the Objects entry.
% Takes no operands and returns nothing.
/print_xref % - print_xref -
88
{ 0 1 Objects llength 1 sub % stack: 0 1 <number of objects - 1>
89
{ dup =only % print object number
91
dup Generations exch lget 1 sub =only % print Generation number
93
dup ObjectStream exch lget ==only % print ObjectStream object number
95
Objects exch lget === % print object location
100
% Get token from string and check its type
101
% <string> <type> typed_token <false> % no token or not match
102
% <string> <type> typed_token <obj> <last> <true> % matching token type
103
% Where last is the string remainder
106
% Body of typed_token; the line that opens the definition is not visible in
% this view. token_nofail is defined elsewhere -- presumably it behaves like
% 'token' on a string without raising an error; TODO confirm.
token_nofail % get token
108
dup type % stack: type last token type
109
% Bring the caller's requested type name to the top and compare it with the
% type of the token just read.
4 -1 roll eq { % stack: last token bool
110
exch true % desired object found - set exit status
112
pop pop false % not type - clear stack, set exit status
115
pop false % no token - pop type, set exit status
116
} ifelse % check if we got token
119
% Allocate space for post_eof_count to be bound into procedures below.
120
% Initial value only: search_objects redefines post_eof_count with the result
% of determine_post_eof_count before it scans the file.
/post_eof_count 0 def
122
% We want the location of the trailer dictionary at the start of file.
123
% First we will find the xref. Then we will skip over the xref entries
125
% Look for an xref table near the start of the file, skip over its entries
% and leave the file position reached after the 'trailer' keyword on the
% stack. Falls back to search_end_trailer when no xref is found.
/search_start_trailer % - search_start_trailer <trailer loc>
126
{ % Read the first 300 bytes and check for xref
127
PDFfile 0 setfileposition
128
PDFfile bytesavailable post_eof_count sub % location of end of data
129
300 .min % block size to read
130
% Allocate a string of the block size and fill it one byte at a time.
dup string 0 1 4 -1 roll 1 sub
131
{ 2 copy PDFfile read pop put pop } for
134
% NOTE(review): the search that produced these operands is not visible in
% this view; presumably 'length 4 add' is the offset just past a
% 4-character match -- confirm against the full file.
exch pop exch pop length 4 add PDFfile exch setfileposition
135
PDFfile token pop % get starting entry - or 'trailer'
136
(trailer) ne { % if we do not already have 'trailer'
137
PDFfile token pop % get number of entries
138
PDFfile token pop pop % this moves us into the middle of the first entry
139
% The entry count read above becomes the repeat count; each pass consumes
% one line of the file.
25 string exch % define working string for readline
140
{ PDFfile 1 index readline pop pop
141
} repeat % skip entries
142
pop % pop working string
143
PDFfile token pop pop % get 'trailer'
144
PDFfile fileposition % get file position
147
pop search_end_trailer % no xref, should not happen, search end of file
151
% We want the location of the trailer dictionary at the end of file.
152
% We will read the last block of data and search for the final occurrence
153
% of the word 'trailer'
154
% Read up to the last 65535 bytes of useful data (everything before the
% post-%%EOF garbage) into a string and search it for the final occurrence
% of 'trailer'.
/search_end_trailer % - search_end_trailer <trailer loc>
155
{ % Position to read block of data from the end of the file. Note: We ignore
156
% anything past the last %%EOF since this is not PDF data.
157
PDFfile 0 setfileposition
158
PDFfile bytesavailable post_eof_count sub % location of end of data
159
dup 65535 .min % block size to read
160
% stack: <file end pos> <block size>
161
% move file position to the start of the block
162
2 copy sub PDFfile exch setfileposition
164
% Allocate a string of the block size and fill it one byte at a time.
dup string 0 1 4 -1 roll 1 sub { 2 copy PDFfile read pop put pop } for
165
% search for last occurrence of 'trailer'
166
% Each successful search keeps the text after the match and tries again;
% when the search fails, the text after the last match is left on the stack.
(trailer) { search not { exit } if pop } loop
167
% determine where the trailer is in the file
168
% trailer loc = end loc - remaining string length
172
% We want to find the trailer dictionary. There is a trailer dictionary
173
% for each xref object list. We only want the trailer dictionary associated
174
% with the first xref object list. In theory this can be anywhere in the
175
% file. However since we are trying to repair a broken file, we cannot simply
176
% follow the xref links. So we are falling back to a simple strategy. We
177
% find the specified location of the first xref list. If its location is in
178
% the first half of the file then we search for the first trailer dictionary
179
% at the start of the file. Otherwise we search for the last trailer at the
181
% Find the trailer dictionary of a damaged file and run it through
% traileropdict. The 'startxref' value found near the end of the file decides
% whether to look for the trailer at the start of the file
% (search_start_trailer) or at the end (search_end_trailer).
/search_trailer % - search_trailer -
182
{ % Find the 'startxref' and associated position at the end of the file.
183
% Position to read block of data from the end of the file. Note: We
184
% actually end at the end of the last %%EOF since this is the end of the
185
% useful PDF data. (Some files contain trailing garbage.)
186
PDFfile 0 setfileposition
187
PDFfile bytesavailable % size of file
188
post_eof_count sub dup % location of end of last %%EOF
189
dup 4096 .min % block size to read
190
% stack: <useful file size> <useful file size> <block size>
191
% move file position to the start of the block
192
2 copy sub PDFfile exch setfileposition
194
% Allocate a string of the block size and fill it one byte at a time.
dup string 0 1 4 -1 roll 1 sub { 2 copy PDFfile read pop put pop } for
195
% search for last occurrence of 'startxref'
196
false % Assume that startxref not present
198
search not { exit } if % Exit loop when no more startxref's
199
pop 3 -1 roll pop true 3 1 roll % Indicate that we have found startxref
201
exch % Exch last string and 'found' flag
203
% determine where the startxref is in the file
204
% 'startxref' loc = end loc - remaining string length - 9 bytes
206
% move the file to this position and read startxref and position
207
% The first token read is discarded; the second token is kept on the stack.
PDFfile exch setfileposition PDFfile token
208
pop pop PDFfile token pop
210
% startxref not found. We will search the end of the file for trailer.
213
% compare xref position to 1/2 the length of the file and search for trailer
214
exch 2 div lt { search_start_trailer } { search_end_trailer } ifelse
216
PDFfile exch setfileposition % set to the specified trailer location
217
/dictlevelcount 0 def
218
PDFfile traileropdict .pdfrun % read trailer info
222
% This routine will determine if there is stuff after the %%EOF. There is
223
% supposed to be only a line termination. However many real life files
224
% contain some garbage. This routine checks how much. We then ignore this
225
% stuff when we are scanning for objects.
226
% Read up to the last 4096 bytes of the file and return the number of bytes
% that follow the final '%%EOF' in that block. If '%%EOF' does not occur in
% the block, the search never matches and the whole block length is returned.
/determine_post_eof_count % - determine_post_eof_count <count>
227
{ % Position to read block of data from the end of the file.
228
PDFfilelen % size of file
229
dup 4096 .min % block size to read
230
% stack: <file size> <file size> <block size>
231
% move file position to the start of the block
232
2 copy sub PDFfile exch setfileposition
234
% Allocate a string of the block size and fill it one byte at a time.
dup string 0 1 4 -1 roll 1 sub { 2 copy PDFfile read pop put pop } for
235
% search for last occurrence of '%%EOF'
236
% Each successful search keeps the text after the match and tries again;
% when the search fails, only the text after the last match is left.
(%%EOF) { search not { exit } if pop } loop
237
% how much is left = remaining string length
238
length exch pop % drop the value beneath the count (the file size, per the stack comment above)
241
% This routine will scan a file searching for object locations to build
242
% an alternate version of the data in the xref tables.
243
% Its purpose is to provide a basis for an xref fixing facility.
244
% Scan the file line by line from the beginning. Every line that starts with
% <integer> <integer> obj is recorded through setxrefentry with the rebuild
% flag set to true. The scan stops about 20 bytes before the end of the
% useful data (file length minus post_eof_count).
/search_objects % - search_objects -
245
{ % Initialize the Objects, Generations, etc. larrays
247
% reset duplicate object and generation numbers error flag
248
/dup_obj_gen_num false def
249
% Determine how many bytes are in the file after the final %%EOF
250
/post_eof_count determine_post_eof_count def
251
% Start at the beginning of the file
252
PDFfile 0 setfileposition
253
% Create a working string (and also store its length on stack). We are
254
% using a maximum size string since the logic below wants a recovered object
255
% to fit into our working string.
257
{ % Now loop through the entire file looking for objects
258
PDFfile fileposition % save current file position
259
% When we get near the end of the file, we use a smaller interval of
260
% our working string to prevent reading past the end. (See comments on
261
% EOF testing below.)
262
PDFfile bytesavailable post_eof_count sub 10 sub dup 4 index lt {
263
2 index 0 3 -1 roll getinterval % near EOF, use interval of string
264
} { pop 1 index % not near end, use full working string
266
% Read a line from file. If the line does not fit into our working string,
267
% or any other error, then we will discard it.
268
PDFfile exch { readline } .internalstopped
269
{ pop pop false } if % indicate no string if we stopped
270
{ % stack: <length> <working_str> <loc> <string>
271
% Now that we have line, get obj num, ref num, and 'obj'. Verify that each
272
% of these is correct type.
273
/integertype typed_token { % get obj number
274
/integertype typed_token { % get ref number
275
/nametype typed_token { % get 'obj' text
276
pop % pop remaining string
277
/obj eq { % verify name is 'obj'
278
% make sure we have room in the arrays. We work in increments
279
% of 20 each time we increase the size.
280
1 index 20 add 20 idiv 20 mul
282
% save xref parameters into ObjectStream, Objects and Generations
% The call passes the object number, a stream number of 0, the line's file
% position minus PDFoffset, the second integer from the line (used as the
% generation number) and a rebuild flag of true.
283
1 index 0 % rearrange parms for setxrefentry
284
4 index PDFoffset sub 3 index
285
//true setxrefentry % save parameters
286
pop pop pop pop % clear parameters
287
} if % check if name is 'obj'
288
} if % check if we got 'obj' string
289
pop % remove ref number
290
} if % check if we got ref number
291
pop % remove obj number
292
} if % check if we got object number
293
} if % check if got a string from readline
294
pop % remove location
295
% Check if we are approaching the end of the file. We do not want to
296
% read past the end of the file since that closes it. We actually stop
297
% 10-20 bytes early since there cannot be an object that close to the end.
298
% (There is a Trailer dictionary, etc. at the end of the file.)
299
PDFfile bytesavailable post_eof_count sub 20 lt { exit } if
300
} loop % loop through the entire file
301
pop pop % remove working string and its length
302
% Output warning if we have two objects with the same object and generation
305
( **** Warning: There are objects with matching object and generation\n)
307
( **** numbers. The accuracy of the resulting image is unknown.\n)
312
% Print warning message because we found a problem while reading the xref
315
% Warning text reported when the normal xref tables could not be read.
% NOTE(review): the line naming this procedure and the operator that emits
% each string are not visible in this view; recover_xref_data calls
% print_xref_warning, which is presumably this procedure -- confirm.
% The strings are runtime output and are left exactly as written.
{ ( **** Warning: An error occurred while reading an XREF table.\n)
317
( **** The file has been damaged. This may have been caused\n)
319
( **** by a problem while converting or transfering the file.\n)
321
( **** Ghostscript will attempt to recover the data.\n)
325
% Attempt to recover the XRef data. This is called if we have a failure
326
% while reading the normal XRef tables. This routine usually works
327
% only for pre PDF1.5 versions of PDF files.
328
/recover_xref_data % - recover_xref_data -
329
{ print_xref_warning % Print warning message
330
count pdfemptycount sub { pop } repeat % remove anything left by readxref
331
search_objects % Search for objects