Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
4680 right-hear 1
#include "fitz.h"
2
#include "mupdf.h"
3
 
4
/* Scan file for objects and reconstruct xref table */
5
 
6
/* One object located while scanning the file during xref repair. */
struct entry
{
	int num;	/* object number */
	int gen;	/* generation number */
	int ofs;	/* file offset of the object's number token */
	int stm_ofs;	/* offset of the stream data, or 0 if the object has no stream */
	int stm_len;	/* stream length measured by scanning, or -1 if not measured */
};
14
 
15
static fz_error
16
pdf_repair_obj(fz_stream *file, char *buf, int cap, int *stmofsp, int *stmlenp, fz_obj **encrypt, fz_obj **id)
17
{
18
	fz_error error;
19
	int tok;
20
	int stm_len;
21
	int len;
22
	int n;
23
 
24
	*stmofsp = 0;
25
	*stmlenp = -1;
26
 
27
	stm_len = 0;
28
 
29
	error = pdf_lex(&tok, file, buf, cap, &len);
30
	if (error)
31
		return fz_rethrow(error, "cannot parse object");
32
	if (tok == PDF_TOK_OPEN_DICT)
33
	{
34
		fz_obj *dict, *obj;
35
 
36
		/* Send NULL xref so we don't try to resolve references */
37
		error = pdf_parse_dict(&dict, NULL, file, buf, cap);
38
		if (error)
39
			return fz_rethrow(error, "cannot parse object");
40
 
41
		obj = fz_dict_gets(dict, "Type");
42
		if (fz_is_name(obj) && !strcmp(fz_to_name(obj), "XRef"))
43
		{
44
			obj = fz_dict_gets(dict, "Encrypt");
45
			if (obj)
46
			{
47
				if (*encrypt)
48
					fz_drop_obj(*encrypt);
49
				*encrypt = fz_keep_obj(obj);
50
			}
51
 
52
			obj = fz_dict_gets(dict, "ID");
53
			if (obj)
54
			{
55
				if (*id)
56
					fz_drop_obj(*id);
57
				*id = fz_keep_obj(obj);
58
			}
59
		}
60
 
61
		obj = fz_dict_gets(dict, "Length");
62
		if (fz_is_int(obj))
63
			stm_len = fz_to_int(obj);
64
 
65
		fz_drop_obj(dict);
66
	}
67
 
68
	while ( tok != PDF_TOK_STREAM &&
69
		tok != PDF_TOK_ENDOBJ &&
70
		tok != PDF_TOK_ERROR &&
71
		tok != PDF_TOK_EOF )
72
	{
73
		error = pdf_lex(&tok, file, buf, cap, &len);
74
		if (error)
75
			return fz_rethrow(error, "cannot scan for endobj or stream token");
76
	}
77
 
78
	if (tok == PDF_TOK_STREAM)
79
	{
80
		int c = fz_read_byte(file);
81
		if (c == '\r') {
82
			c = fz_peek_byte(file);
83
			if (c == '\n')
84
				fz_read_byte(file);
85
		}
86
 
87
		*stmofsp = fz_tell(file);
88
		if (*stmofsp < 0)
89
			return fz_throw("cannot seek in file");
90
 
91
		if (stm_len > 0)
92
		{
93
			fz_seek(file, *stmofsp + stm_len, 0);
94
			error = pdf_lex(&tok, file, buf, cap, &len);
95
			if (error)
96
				fz_catch(error, "cannot find endstream token, falling back to scanning");
97
			if (tok == PDF_TOK_ENDSTREAM)
98
				goto atobjend;
99
			fz_seek(file, *stmofsp, 0);
100
		}
101
 
102
		n = fz_read(file, (unsigned char *) buf, 9);
103
		if (n < 0)
104
			return fz_rethrow(n, "cannot read from file");
105
 
106
		while (memcmp(buf, "endstream", 9) != 0)
107
		{
108
			c = fz_read_byte(file);
109
			if (c == EOF)
110
				break;
111
			memmove(buf, buf + 1, 8);
112
			buf[8] = c;
113
		}
114
 
115
		*stmlenp = fz_tell(file) - *stmofsp - 9;
116
 
117
atobjend:
118
		error = pdf_lex(&tok, file, buf, cap, &len);
119
		if (error)
120
			return fz_rethrow(error, "cannot scan for endobj token");
121
		if (tok != PDF_TOK_ENDOBJ)
122
			fz_warn("object missing 'endobj' token");
123
	}
124
 
125
	return fz_okay;
126
}
127
 
128
static fz_error
129
pdf_repair_obj_stm(pdf_xref *xref, int num, int gen)
130
{
131
	fz_error error;
132
	fz_obj *obj;
133
	fz_stream *stm;
134
	int tok;
135
	int i, n, count;
136
	char buf[256];
137
 
138
	error = pdf_load_object(&obj, xref, num, gen);
139
	if (error)
140
		return fz_rethrow(error, "cannot load object stream object (%d %d R)", num, gen);
141
 
142
	count = fz_to_int(fz_dict_gets(obj, "N"));
143
 
144
	fz_drop_obj(obj);
145
 
146
	error = pdf_open_stream(&stm, xref, num, gen);
147
	if (error)
148
		return fz_rethrow(error, "cannot open object stream object (%d %d R)", num, gen);
149
 
150
	for (i = 0; i < count; i++)
151
	{
152
		error = pdf_lex(&tok, stm, buf, sizeof buf, &n);
153
		if (error || tok != PDF_TOK_INT)
154
		{
155
			fz_close(stm);
156
			return fz_rethrow(error, "corrupt object stream (%d %d R)", num, gen);
157
		}
158
 
159
		n = atoi(buf);
160
		if (n >= xref->len)
161
			pdf_resize_xref(xref, n + 1);
162
 
163
		xref->table[n].ofs = num;
164
		xref->table[n].gen = i;
165
		xref->table[n].stm_ofs = 0;
166
		xref->table[n].obj = NULL;
167
		xref->table[n].type = 'o';
168
 
169
		error = pdf_lex(&tok, stm, buf, sizeof buf, &n);
170
		if (error || tok != PDF_TOK_INT)
171
		{
172
			fz_close(stm);
173
			return fz_rethrow(error, "corrupt object stream (%d %d R)", num, gen);
174
		}
175
	}
176
 
177
	fz_close(stm);
178
	return fz_okay;
179
}
180
 
181
fz_error
182
pdf_repair_xref(pdf_xref *xref, char *buf, int bufsize)
183
{
184
	fz_error error;
185
	fz_obj *dict, *obj;
186
	fz_obj *length;
187
 
188
	fz_obj *encrypt = NULL;
189
	fz_obj *id = NULL;
190
	fz_obj *root = NULL;
191
	fz_obj *info = NULL;
192
 
193
	struct entry *list = NULL;
194
	int listlen;
195
	int listcap;
196
	int maxnum = 0;
197
 
198
	int num = 0;
199
	int gen = 0;
200
	int tmpofs, numofs = 0, genofs = 0;
201
	int stm_len, stm_ofs = 0;
202
	int tok;
203
	int next;
204
	int i, n, c;
205
 
206
	fz_seek(xref->file, 0, 0);
207
 
208
	listlen = 0;
209
	listcap = 1024;
210
	list = fz_calloc(listcap, sizeof(struct entry));
211
 
212
	/* look for '%PDF' version marker within first kilobyte of file */
213
	n = fz_read(xref->file, (unsigned char *)buf, MAX(bufsize, 1024));
214
	if (n < 0)
215
	{
216
		error = fz_rethrow(n, "cannot read from file");
217
		goto cleanup;
218
	}
219
 
220
	fz_seek(xref->file, 0, 0);
221
	for (i = 0; i < n - 4; i++)
222
	{
223
		if (memcmp(buf + i, "%PDF", 4) == 0)
224
		{
225
			fz_seek(xref->file, i + 8, 0); /* skip "%PDF-X.Y" */
226
			break;
227
		}
228
	}
229
 
230
	/* skip comment line after version marker since some generators
231
	 * forget to terminate the comment with a newline */
232
	c = fz_read_byte(xref->file);
233
	while (c >= 0 && (c == ' ' || c == '%'))
234
		c = fz_read_byte(xref->file);
235
	fz_unread_byte(xref->file);
236
 
237
	while (1)
238
	{
239
		tmpofs = fz_tell(xref->file);
240
		if (tmpofs < 0)
241
		{
242
			error = fz_throw("cannot tell in file");
243
			goto cleanup;
244
		}
245
 
246
		error = pdf_lex(&tok, xref->file, buf, bufsize, &n);
247
		if (error)
248
		{
249
			fz_catch(error, "ignoring the rest of the file");
250
			break;
251
		}
252
 
253
		if (tok == PDF_TOK_INT)
254
		{
255
			numofs = genofs;
256
			num = gen;
257
			genofs = tmpofs;
258
			gen = atoi(buf);
259
		}
260
 
261
		else if (tok == PDF_TOK_OBJ)
262
		{
263
			error = pdf_repair_obj(xref->file, buf, bufsize, &stm_ofs, &stm_len, &encrypt, &id);
264
			if (error)
265
			{
266
				error = fz_rethrow(error, "cannot parse object (%d %d R)", num, gen);
267
				goto cleanup;
268
			}
269
 
270
			if (listlen + 1 == listcap)
271
			{
272
				listcap = (listcap * 3) / 2;
273
				list = fz_realloc(list, listcap, sizeof(struct entry));
274
			}
275
 
276
			list[listlen].num = num;
277
			list[listlen].gen = gen;
278
			list[listlen].ofs = numofs;
279
			list[listlen].stm_ofs = stm_ofs;
280
			list[listlen].stm_len = stm_len;
281
			listlen ++;
282
 
283
			if (num > maxnum)
284
				maxnum = num;
285
		}
286
 
287
		/* trailer dictionary */
288
		else if (tok == PDF_TOK_OPEN_DICT)
289
		{
290
			error = pdf_parse_dict(&dict, xref, xref->file, buf, bufsize);
291
			if (error)
292
			{
293
				error = fz_rethrow(error, "cannot parse object");
294
				goto cleanup;
295
			}
296
 
297
			obj = fz_dict_gets(dict, "Encrypt");
298
			if (obj)
299
			{
300
				if (encrypt)
301
					fz_drop_obj(encrypt);
302
				encrypt = fz_keep_obj(obj);
303
			}
304
 
305
			obj = fz_dict_gets(dict, "ID");
306
			if (obj)
307
			{
308
				if (id)
309
					fz_drop_obj(id);
310
				id = fz_keep_obj(obj);
311
			}
312
 
313
			obj = fz_dict_gets(dict, "Root");
314
			if (obj)
315
			{
316
				if (root)
317
					fz_drop_obj(root);
318
				root = fz_keep_obj(obj);
319
			}
320
 
321
			obj = fz_dict_gets(dict, "Info");
322
			if (obj)
323
			{
324
				if (info)
325
					fz_drop_obj(info);
326
				info = fz_keep_obj(obj);
327
			}
328
 
329
			fz_drop_obj(dict);
330
		}
331
 
332
		else if (tok == PDF_TOK_ERROR)
333
			fz_read_byte(xref->file);
334
 
335
		else if (tok == PDF_TOK_EOF)
336
			break;
337
	}
338
 
339
	/* make xref reasonable */
340
 
341
	pdf_resize_xref(xref, maxnum + 1);
342
 
343
	for (i = 0; i < listlen; i++)
344
	{
345
		xref->table[list[i].num].type = 'n';
346
		xref->table[list[i].num].ofs = list[i].ofs;
347
		xref->table[list[i].num].gen = list[i].gen;
348
 
349
		xref->table[list[i].num].stm_ofs = list[i].stm_ofs;
350
 
351
		/* corrected stream length */
352
		if (list[i].stm_len >= 0)
353
		{
354
			error = pdf_load_object(&dict, xref, list[i].num, list[i].gen);
355
			if (error)
356
			{
357
				error = fz_rethrow(error, "cannot load stream object (%d %d R)", list[i].num, list[i].gen);
358
				goto cleanup;
359
			}
360
 
361
			length = fz_new_int(list[i].stm_len);
362
			fz_dict_puts(dict, "Length", length);
363
			fz_drop_obj(length);
364
 
365
			fz_drop_obj(dict);
366
		}
367
 
368
	}
369
 
370
	xref->table[0].type = 'f';
371
	xref->table[0].ofs = 0;
372
	xref->table[0].gen = 65535;
373
	xref->table[0].stm_ofs = 0;
374
	xref->table[0].obj = NULL;
375
 
376
	next = 0;
377
	for (i = xref->len - 1; i >= 0; i--)
378
	{
379
		if (xref->table[i].type == 'f')
380
		{
381
			xref->table[i].ofs = next;
382
			if (xref->table[i].gen < 65535)
383
				xref->table[i].gen ++;
384
			next = i;
385
		}
386
	}
387
 
388
	/* create a repaired trailer, Root will be added later */
389
 
390
	xref->trailer = fz_new_dict(5);
391
 
392
	obj = fz_new_int(maxnum + 1);
393
	fz_dict_puts(xref->trailer, "Size", obj);
394
	fz_drop_obj(obj);
395
 
396
	if (root)
397
	{
398
		fz_dict_puts(xref->trailer, "Root", root);
399
		fz_drop_obj(root);
400
	}
401
	if (info)
402
	{
403
		fz_dict_puts(xref->trailer, "Info", info);
404
		fz_drop_obj(info);
405
	}
406
 
407
	if (encrypt)
408
	{
409
		if (fz_is_indirect(encrypt))
410
		{
411
			/* create new reference with non-NULL xref pointer */
412
			obj = fz_new_indirect(fz_to_num(encrypt), fz_to_gen(encrypt), xref);
413
			fz_drop_obj(encrypt);
414
			encrypt = obj;
415
		}
416
		fz_dict_puts(xref->trailer, "Encrypt", encrypt);
417
		fz_drop_obj(encrypt);
418
	}
419
 
420
	if (id)
421
	{
422
		if (fz_is_indirect(id))
423
		{
424
			/* create new reference with non-NULL xref pointer */
425
			obj = fz_new_indirect(fz_to_num(id), fz_to_gen(id), xref);
426
			fz_drop_obj(id);
427
			id = obj;
428
		}
429
		fz_dict_puts(xref->trailer, "ID", id);
430
		fz_drop_obj(id);
431
	}
432
 
433
	fz_free(list);
434
	return fz_okay;
435
 
436
cleanup:
437
	if (encrypt) fz_drop_obj(encrypt);
438
	if (id) fz_drop_obj(id);
439
	if (root) fz_drop_obj(root);
440
	if (info) fz_drop_obj(info);
441
	fz_free(list);
442
	return error; /* already rethrown */
443
}
444
 
445
fz_error
446
pdf_repair_obj_stms(pdf_xref *xref)
447
{
448
	fz_obj *dict;
449
	int i;
450
 
451
	for (i = 0; i < xref->len; i++)
452
	{
453
		if (xref->table[i].stm_ofs)
454
		{
455
			pdf_load_object(&dict, xref, i, 0);
456
			if (!strcmp(fz_to_name(fz_dict_gets(dict, "Type")), "ObjStm"))
457
				pdf_repair_obj_stm(xref, i, 0);
458
			fz_drop_obj(dict);
459
		}
460
	}
461
 
462
	return fz_okay;
463
}