Go to most recent revision | Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
4680 | right-hear | 1 | #include "fitz.h" |
2 | #include "mupdf.h" |
||
3 | |||
4 | /* Scan file for objects and reconstruct xref table */ |
||
5 | |||
/* One object found while scanning the file; later copied into the xref table. */
struct entry
{
	int num, gen;		/* object number and generation number */
	int ofs;		/* file offset recorded for the object (offset of its number token) */
	int stm_ofs, stm_len;	/* stream data offset (0 if none) and corrected length (-1 if not corrected) */
};
||
14 | |||
15 | static fz_error |
||
16 | pdf_repair_obj(fz_stream *file, char *buf, int cap, int *stmofsp, int *stmlenp, fz_obj **encrypt, fz_obj **id) |
||
17 | { |
||
18 | fz_error error; |
||
19 | int tok; |
||
20 | int stm_len; |
||
21 | int len; |
||
22 | int n; |
||
23 | |||
24 | *stmofsp = 0; |
||
25 | *stmlenp = -1; |
||
26 | |||
27 | stm_len = 0; |
||
28 | |||
29 | error = pdf_lex(&tok, file, buf, cap, &len); |
||
30 | if (error) |
||
31 | return fz_rethrow(error, "cannot parse object"); |
||
32 | if (tok == PDF_TOK_OPEN_DICT) |
||
33 | { |
||
34 | fz_obj *dict, *obj; |
||
35 | |||
36 | /* Send NULL xref so we don't try to resolve references */ |
||
37 | error = pdf_parse_dict(&dict, NULL, file, buf, cap); |
||
38 | if (error) |
||
39 | return fz_rethrow(error, "cannot parse object"); |
||
40 | |||
41 | obj = fz_dict_gets(dict, "Type"); |
||
42 | if (fz_is_name(obj) && !strcmp(fz_to_name(obj), "XRef")) |
||
43 | { |
||
44 | obj = fz_dict_gets(dict, "Encrypt"); |
||
45 | if (obj) |
||
46 | { |
||
47 | if (*encrypt) |
||
48 | fz_drop_obj(*encrypt); |
||
49 | *encrypt = fz_keep_obj(obj); |
||
50 | } |
||
51 | |||
52 | obj = fz_dict_gets(dict, "ID"); |
||
53 | if (obj) |
||
54 | { |
||
55 | if (*id) |
||
56 | fz_drop_obj(*id); |
||
57 | *id = fz_keep_obj(obj); |
||
58 | } |
||
59 | } |
||
60 | |||
61 | obj = fz_dict_gets(dict, "Length"); |
||
62 | if (fz_is_int(obj)) |
||
63 | stm_len = fz_to_int(obj); |
||
64 | |||
65 | fz_drop_obj(dict); |
||
66 | } |
||
67 | |||
68 | while ( tok != PDF_TOK_STREAM && |
||
69 | tok != PDF_TOK_ENDOBJ && |
||
70 | tok != PDF_TOK_ERROR && |
||
71 | tok != PDF_TOK_EOF ) |
||
72 | { |
||
73 | error = pdf_lex(&tok, file, buf, cap, &len); |
||
74 | if (error) |
||
75 | return fz_rethrow(error, "cannot scan for endobj or stream token"); |
||
76 | } |
||
77 | |||
78 | if (tok == PDF_TOK_STREAM) |
||
79 | { |
||
80 | int c = fz_read_byte(file); |
||
81 | if (c == '\r') { |
||
82 | c = fz_peek_byte(file); |
||
83 | if (c == '\n') |
||
84 | fz_read_byte(file); |
||
85 | } |
||
86 | |||
87 | *stmofsp = fz_tell(file); |
||
88 | if (*stmofsp < 0) |
||
89 | return fz_throw("cannot seek in file"); |
||
90 | |||
91 | if (stm_len > 0) |
||
92 | { |
||
93 | fz_seek(file, *stmofsp + stm_len, 0); |
||
94 | error = pdf_lex(&tok, file, buf, cap, &len); |
||
95 | if (error) |
||
96 | fz_catch(error, "cannot find endstream token, falling back to scanning"); |
||
97 | if (tok == PDF_TOK_ENDSTREAM) |
||
98 | goto atobjend; |
||
99 | fz_seek(file, *stmofsp, 0); |
||
100 | } |
||
101 | |||
102 | n = fz_read(file, (unsigned char *) buf, 9); |
||
103 | if (n < 0) |
||
104 | return fz_rethrow(n, "cannot read from file"); |
||
105 | |||
106 | while (memcmp(buf, "endstream", 9) != 0) |
||
107 | { |
||
108 | c = fz_read_byte(file); |
||
109 | if (c == EOF) |
||
110 | break; |
||
111 | memmove(buf, buf + 1, 8); |
||
112 | buf[8] = c; |
||
113 | } |
||
114 | |||
115 | *stmlenp = fz_tell(file) - *stmofsp - 9; |
||
116 | |||
117 | atobjend: |
||
118 | error = pdf_lex(&tok, file, buf, cap, &len); |
||
119 | if (error) |
||
120 | return fz_rethrow(error, "cannot scan for endobj token"); |
||
121 | if (tok != PDF_TOK_ENDOBJ) |
||
122 | fz_warn("object missing 'endobj' token"); |
||
123 | } |
||
124 | |||
125 | return fz_okay; |
||
126 | } |
||
127 | |||
128 | static fz_error |
||
129 | pdf_repair_obj_stm(pdf_xref *xref, int num, int gen) |
||
130 | { |
||
131 | fz_error error; |
||
132 | fz_obj *obj; |
||
133 | fz_stream *stm; |
||
134 | int tok; |
||
135 | int i, n, count; |
||
136 | char buf[256]; |
||
137 | |||
138 | error = pdf_load_object(&obj, xref, num, gen); |
||
139 | if (error) |
||
140 | return fz_rethrow(error, "cannot load object stream object (%d %d R)", num, gen); |
||
141 | |||
142 | count = fz_to_int(fz_dict_gets(obj, "N")); |
||
143 | |||
144 | fz_drop_obj(obj); |
||
145 | |||
146 | error = pdf_open_stream(&stm, xref, num, gen); |
||
147 | if (error) |
||
148 | return fz_rethrow(error, "cannot open object stream object (%d %d R)", num, gen); |
||
149 | |||
150 | for (i = 0; i < count; i++) |
||
151 | { |
||
152 | error = pdf_lex(&tok, stm, buf, sizeof buf, &n); |
||
153 | if (error || tok != PDF_TOK_INT) |
||
154 | { |
||
155 | fz_close(stm); |
||
156 | return fz_rethrow(error, "corrupt object stream (%d %d R)", num, gen); |
||
157 | } |
||
158 | |||
159 | n = atoi(buf); |
||
160 | if (n >= xref->len) |
||
161 | pdf_resize_xref(xref, n + 1); |
||
162 | |||
163 | xref->table[n].ofs = num; |
||
164 | xref->table[n].gen = i; |
||
165 | xref->table[n].stm_ofs = 0; |
||
166 | xref->table[n].obj = NULL; |
||
167 | xref->table[n].type = 'o'; |
||
168 | |||
169 | error = pdf_lex(&tok, stm, buf, sizeof buf, &n); |
||
170 | if (error || tok != PDF_TOK_INT) |
||
171 | { |
||
172 | fz_close(stm); |
||
173 | return fz_rethrow(error, "corrupt object stream (%d %d R)", num, gen); |
||
174 | } |
||
175 | } |
||
176 | |||
177 | fz_close(stm); |
||
178 | return fz_okay; |
||
179 | } |
||
180 | |||
181 | fz_error |
||
182 | pdf_repair_xref(pdf_xref *xref, char *buf, int bufsize) |
||
183 | { |
||
184 | fz_error error; |
||
185 | fz_obj *dict, *obj; |
||
186 | fz_obj *length; |
||
187 | |||
188 | fz_obj *encrypt = NULL; |
||
189 | fz_obj *id = NULL; |
||
190 | fz_obj *root = NULL; |
||
191 | fz_obj *info = NULL; |
||
192 | |||
193 | struct entry *list = NULL; |
||
194 | int listlen; |
||
195 | int listcap; |
||
196 | int maxnum = 0; |
||
197 | |||
198 | int num = 0; |
||
199 | int gen = 0; |
||
200 | int tmpofs, numofs = 0, genofs = 0; |
||
201 | int stm_len, stm_ofs = 0; |
||
202 | int tok; |
||
203 | int next; |
||
204 | int i, n, c; |
||
205 | |||
206 | fz_seek(xref->file, 0, 0); |
||
207 | |||
208 | listlen = 0; |
||
209 | listcap = 1024; |
||
210 | list = fz_calloc(listcap, sizeof(struct entry)); |
||
211 | |||
212 | /* look for '%PDF' version marker within first kilobyte of file */ |
||
213 | n = fz_read(xref->file, (unsigned char *)buf, MAX(bufsize, 1024)); |
||
214 | if (n < 0) |
||
215 | { |
||
216 | error = fz_rethrow(n, "cannot read from file"); |
||
217 | goto cleanup; |
||
218 | } |
||
219 | |||
220 | fz_seek(xref->file, 0, 0); |
||
221 | for (i = 0; i < n - 4; i++) |
||
222 | { |
||
223 | if (memcmp(buf + i, "%PDF", 4) == 0) |
||
224 | { |
||
225 | fz_seek(xref->file, i + 8, 0); /* skip "%PDF-X.Y" */ |
||
226 | break; |
||
227 | } |
||
228 | } |
||
229 | |||
230 | /* skip comment line after version marker since some generators |
||
231 | * forget to terminate the comment with a newline */ |
||
232 | c = fz_read_byte(xref->file); |
||
233 | while (c >= 0 && (c == ' ' || c == '%')) |
||
234 | c = fz_read_byte(xref->file); |
||
235 | fz_unread_byte(xref->file); |
||
236 | |||
237 | while (1) |
||
238 | { |
||
239 | tmpofs = fz_tell(xref->file); |
||
240 | if (tmpofs < 0) |
||
241 | { |
||
242 | error = fz_throw("cannot tell in file"); |
||
243 | goto cleanup; |
||
244 | } |
||
245 | |||
246 | error = pdf_lex(&tok, xref->file, buf, bufsize, &n); |
||
247 | if (error) |
||
248 | { |
||
249 | fz_catch(error, "ignoring the rest of the file"); |
||
250 | break; |
||
251 | } |
||
252 | |||
253 | if (tok == PDF_TOK_INT) |
||
254 | { |
||
255 | numofs = genofs; |
||
256 | num = gen; |
||
257 | genofs = tmpofs; |
||
258 | gen = atoi(buf); |
||
259 | } |
||
260 | |||
261 | else if (tok == PDF_TOK_OBJ) |
||
262 | { |
||
263 | error = pdf_repair_obj(xref->file, buf, bufsize, &stm_ofs, &stm_len, &encrypt, &id); |
||
264 | if (error) |
||
265 | { |
||
266 | error = fz_rethrow(error, "cannot parse object (%d %d R)", num, gen); |
||
267 | goto cleanup; |
||
268 | } |
||
269 | |||
270 | if (listlen + 1 == listcap) |
||
271 | { |
||
272 | listcap = (listcap * 3) / 2; |
||
273 | list = fz_realloc(list, listcap, sizeof(struct entry)); |
||
274 | } |
||
275 | |||
276 | list[listlen].num = num; |
||
277 | list[listlen].gen = gen; |
||
278 | list[listlen].ofs = numofs; |
||
279 | list[listlen].stm_ofs = stm_ofs; |
||
280 | list[listlen].stm_len = stm_len; |
||
281 | listlen ++; |
||
282 | |||
283 | if (num > maxnum) |
||
284 | maxnum = num; |
||
285 | } |
||
286 | |||
287 | /* trailer dictionary */ |
||
288 | else if (tok == PDF_TOK_OPEN_DICT) |
||
289 | { |
||
290 | error = pdf_parse_dict(&dict, xref, xref->file, buf, bufsize); |
||
291 | if (error) |
||
292 | { |
||
293 | error = fz_rethrow(error, "cannot parse object"); |
||
294 | goto cleanup; |
||
295 | } |
||
296 | |||
297 | obj = fz_dict_gets(dict, "Encrypt"); |
||
298 | if (obj) |
||
299 | { |
||
300 | if (encrypt) |
||
301 | fz_drop_obj(encrypt); |
||
302 | encrypt = fz_keep_obj(obj); |
||
303 | } |
||
304 | |||
305 | obj = fz_dict_gets(dict, "ID"); |
||
306 | if (obj) |
||
307 | { |
||
308 | if (id) |
||
309 | fz_drop_obj(id); |
||
310 | id = fz_keep_obj(obj); |
||
311 | } |
||
312 | |||
313 | obj = fz_dict_gets(dict, "Root"); |
||
314 | if (obj) |
||
315 | { |
||
316 | if (root) |
||
317 | fz_drop_obj(root); |
||
318 | root = fz_keep_obj(obj); |
||
319 | } |
||
320 | |||
321 | obj = fz_dict_gets(dict, "Info"); |
||
322 | if (obj) |
||
323 | { |
||
324 | if (info) |
||
325 | fz_drop_obj(info); |
||
326 | info = fz_keep_obj(obj); |
||
327 | } |
||
328 | |||
329 | fz_drop_obj(dict); |
||
330 | } |
||
331 | |||
332 | else if (tok == PDF_TOK_ERROR) |
||
333 | fz_read_byte(xref->file); |
||
334 | |||
335 | else if (tok == PDF_TOK_EOF) |
||
336 | break; |
||
337 | } |
||
338 | |||
339 | /* make xref reasonable */ |
||
340 | |||
341 | pdf_resize_xref(xref, maxnum + 1); |
||
342 | |||
343 | for (i = 0; i < listlen; i++) |
||
344 | { |
||
345 | xref->table[list[i].num].type = 'n'; |
||
346 | xref->table[list[i].num].ofs = list[i].ofs; |
||
347 | xref->table[list[i].num].gen = list[i].gen; |
||
348 | |||
349 | xref->table[list[i].num].stm_ofs = list[i].stm_ofs; |
||
350 | |||
351 | /* corrected stream length */ |
||
352 | if (list[i].stm_len >= 0) |
||
353 | { |
||
354 | error = pdf_load_object(&dict, xref, list[i].num, list[i].gen); |
||
355 | if (error) |
||
356 | { |
||
357 | error = fz_rethrow(error, "cannot load stream object (%d %d R)", list[i].num, list[i].gen); |
||
358 | goto cleanup; |
||
359 | } |
||
360 | |||
361 | length = fz_new_int(list[i].stm_len); |
||
362 | fz_dict_puts(dict, "Length", length); |
||
363 | fz_drop_obj(length); |
||
364 | |||
365 | fz_drop_obj(dict); |
||
366 | } |
||
367 | |||
368 | } |
||
369 | |||
370 | xref->table[0].type = 'f'; |
||
371 | xref->table[0].ofs = 0; |
||
372 | xref->table[0].gen = 65535; |
||
373 | xref->table[0].stm_ofs = 0; |
||
374 | xref->table[0].obj = NULL; |
||
375 | |||
376 | next = 0; |
||
377 | for (i = xref->len - 1; i >= 0; i--) |
||
378 | { |
||
379 | if (xref->table[i].type == 'f') |
||
380 | { |
||
381 | xref->table[i].ofs = next; |
||
382 | if (xref->table[i].gen < 65535) |
||
383 | xref->table[i].gen ++; |
||
384 | next = i; |
||
385 | } |
||
386 | } |
||
387 | |||
388 | /* create a repaired trailer, Root will be added later */ |
||
389 | |||
390 | xref->trailer = fz_new_dict(5); |
||
391 | |||
392 | obj = fz_new_int(maxnum + 1); |
||
393 | fz_dict_puts(xref->trailer, "Size", obj); |
||
394 | fz_drop_obj(obj); |
||
395 | |||
396 | if (root) |
||
397 | { |
||
398 | fz_dict_puts(xref->trailer, "Root", root); |
||
399 | fz_drop_obj(root); |
||
400 | } |
||
401 | if (info) |
||
402 | { |
||
403 | fz_dict_puts(xref->trailer, "Info", info); |
||
404 | fz_drop_obj(info); |
||
405 | } |
||
406 | |||
407 | if (encrypt) |
||
408 | { |
||
409 | if (fz_is_indirect(encrypt)) |
||
410 | { |
||
411 | /* create new reference with non-NULL xref pointer */ |
||
412 | obj = fz_new_indirect(fz_to_num(encrypt), fz_to_gen(encrypt), xref); |
||
413 | fz_drop_obj(encrypt); |
||
414 | encrypt = obj; |
||
415 | } |
||
416 | fz_dict_puts(xref->trailer, "Encrypt", encrypt); |
||
417 | fz_drop_obj(encrypt); |
||
418 | } |
||
419 | |||
420 | if (id) |
||
421 | { |
||
422 | if (fz_is_indirect(id)) |
||
423 | { |
||
424 | /* create new reference with non-NULL xref pointer */ |
||
425 | obj = fz_new_indirect(fz_to_num(id), fz_to_gen(id), xref); |
||
426 | fz_drop_obj(id); |
||
427 | id = obj; |
||
428 | } |
||
429 | fz_dict_puts(xref->trailer, "ID", id); |
||
430 | fz_drop_obj(id); |
||
431 | } |
||
432 | |||
433 | fz_free(list); |
||
434 | return fz_okay; |
||
435 | |||
436 | cleanup: |
||
437 | if (encrypt) fz_drop_obj(encrypt); |
||
438 | if (id) fz_drop_obj(id); |
||
439 | if (root) fz_drop_obj(root); |
||
440 | if (info) fz_drop_obj(info); |
||
441 | fz_free(list); |
||
442 | return error; /* already rethrown */ |
||
443 | } |
||
444 | |||
445 | fz_error |
||
446 | pdf_repair_obj_stms(pdf_xref *xref) |
||
447 | { |
||
448 | fz_obj *dict; |
||
449 | int i; |
||
450 | |||
451 | for (i = 0; i < xref->len; i++) |
||
452 | { |
||
453 | if (xref->table[i].stm_ofs) |
||
454 | { |
||
455 | pdf_load_object(&dict, xref, i, 0); |
||
456 | if (!strcmp(fz_to_name(fz_dict_gets(dict, "Type")), "ObjStm")) |
||
457 | pdf_repair_obj_stm(xref, i, 0); |
||
458 | fz_drop_obj(dict); |
||
459 | } |
||
460 | } |
||
461 | |||
462 | return fz_okay; |
||
463 | }>>>>>>>>> |