Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
4680 right-hear 1
/*
2
 * PDF cleaning tool: general purpose pdf syntax washer.
3
 *
4
 * Rewrite PDF with pretty printed objects.
5
 * Garbage collect unreachable objects.
6
 * Inflate compressed streams.
7
 * Create subset documents.
8
 *
9
 * TODO: linearize document for fast web view
10
 */
11
 
12
#include "fitz.h"
13
#include "mupdf.h"
14
 
15
static FILE *out = NULL;
16
 
17
static char *uselist = NULL;
18
static int *ofslist = NULL;
19
static int *genlist = NULL;
20
static int *renumbermap = NULL;
21
 
22
static int dogarbage = 0;
23
static int doexpand = 0;
24
static int doascii = 0;
25
 
26
static pdf_xref *xref = NULL;
27
 
28
/*
 * Fatal error exit: pass the error to fz_catch() with the message
 * "aborting", free the document if one has been opened, and terminate
 * the process with status 1. Never returns.
 */
void die(fz_error error)
{
	fz_catch(error, "aborting");

	if (xref != NULL)
	{
		pdf_free_xref(xref);
	}

	exit(1);
}
35
 
36
/* Print the command line synopsis on stderr and exit with status 1. */
static void usage(void)
{
	static const char synopsis[] =
		"usage: pdfclean [options] input.pdf [output.pdf] [pages]\n"
		"\t-p -\tpassword\n"
		"\t-g\tgarbage collect unused objects\n"
		"\t-gg\tin addition to -g compact xref table\n"
		"\t-ggg\tin addition to -gg merge duplicate objects\n"
		"\t-d\tdecompress streams\n"
		"\t-a\tascii hex encode binary streams\n"
		"\tpages\tcomma separated list of ranges\n";

	fputs(synopsis, stderr);
	exit(1);
}
49
 
50
/*
51
 * Garbage collect objects not reachable from the trailer.
52
 */
53
 
54
static void sweepref(fz_obj *ref);
55
 
56
/*
 * Mark phase helper: walk an object recursively. An indirect reference
 * is handed to sweepref(); for a dictionary every value is visited and
 * for an array every element is visited. Any other kind of object is
 * ignored.
 */
static void sweepobj(fz_obj *obj)
{
	int idx;

	if (fz_is_indirect(obj))
	{
		sweepref(obj);
		return;
	}

	if (fz_is_dict(obj))
	{
		for (idx = 0; idx < fz_dict_len(obj); idx++)
			sweepobj(fz_dict_get_val(obj, idx));
		return;
	}

	if (fz_is_array(obj))
	{
		for (idx = 0; idx < fz_array_len(obj); idx++)
			sweepobj(fz_array_get(obj, idx));
	}
}
71
 
72
/*
 * Mark the object an indirect reference points at as used and sweep
 * its contents.
 *
 * References whose object number lies outside the xref table are
 * ignored, and an object that is already marked is not visited again
 * (which also stops reference cycles from recursing forever).
 *
 * For stream objects an indirect /Length is replaced by its resolved
 * value, and the object that used to hold the length is unmarked.
 */
static void sweepref(fz_obj *obj)
{
	int num = fz_to_num(obj);
	int gen = fz_to_gen(obj);

	if (num < 0 || num >= xref->len)
		return;
	if (uselist[num])
		return;

	uselist[num] = 1;

	/* Bake in /Length in stream objects */
	if (pdf_is_stream(xref, num, gen))
	{
		fz_obj *len = fz_dict_gets(obj, "Length");
		if (fz_is_indirect(len))
		{
			int lennum = fz_to_num(len);

			/*
			 * The length reference comes straight from the file, so
			 * its object number must be range checked before it is
			 * used to index uselist.
			 */
			if (lennum >= 0 && lennum < xref->len)
				uselist[lennum] = 0;

			len = fz_resolve_indirect(len);
			fz_dict_puts(obj, "Length", len);
		}
	}

	sweepobj(fz_resolve_indirect(obj));
}
98
 
99
/*
100
 * Scan for and remove duplicate objects (slow)
101
 */
102
 
103
static void removeduplicateobjs(void)
104
{
105
	int num, other;
106
 
107
	for (num = 1; num < xref->len; num++)
108
	{
109
		/* Only compare an object to objects preceeding it */
110
		for (other = 1; other < num; other++)
111
		{
112
			fz_obj *a, *b;
113
 
114
			if (num == other || !uselist[num] || !uselist[other])
115
				continue;
116
 
117
			/*
118
			 * Comparing stream objects data contents would take too long.
119
			 *
120
			 * pdf_is_stream calls pdf_cache_object and ensures
121
			 * that the xref table has the objects loaded.
122
			 */
123
			if (pdf_is_stream(xref, num, 0) || pdf_is_stream(xref, other, 0))
124
				continue;
125
 
126
			a = xref->table[num].obj;
127
			b = xref->table[other].obj;
128
 
129
			a = fz_resolve_indirect(a);
130
			b = fz_resolve_indirect(b);
131
 
132
			if (fz_objcmp(a, b))
133
				continue;
134
 
135
			/* Keep the lowest numbered object */
136
			renumbermap[num] = MIN(num, other);
137
			renumbermap[other] = MIN(num, other);
138
			uselist[MAX(num, other)] = 0;
139
 
140
			/* One duplicate was found, do not look for another */
141
			break;
142
		}
143
	}
144
}
145
 
146
/*
147
 * Renumber objects sequentially so the xref is more compact
148
 */
149
 
150
static void compactxref(void)
151
{
152
	int num, newnum;
153
 
154
	/*
155
	 * Update renumbermap in-place, clustering all used
156
	 * objects together at low object ids. Objects that
157
	 * already should be renumbered will have their new
158
	 * object ids be updated to reflect the compaction.
159
	 */
160
 
161
	newnum = 1;
162
	for (num = 1; num < xref->len; num++)
163
	{
164
		if (uselist[num] && renumbermap[num] == num)
165
			renumbermap[num] = newnum++;
166
		else if (renumbermap[num] != num)
167
			renumbermap[num] = renumbermap[renumbermap[num]];
168
	}
169
}
170
 
171
/*
172
 * Update indirect objects according to renumbering established when
173
 * removing duplicate objects and compacting the xref.
174
 */
175
 
176
/*
 * Recursively rewrite the indirect references stored in a dictionary
 * or array so that they use the object numbers from renumbermap. The
 * replacement references are created with generation 0. Objects that
 * are neither dictionaries nor arrays are left alone.
 *
 * A reference whose object number is outside the xref table has no
 * entry in renumbermap; it is left untouched instead of indexing the
 * map out of bounds.
 */
static void renumberobj(fz_obj *obj)
{
	int i;

	if (fz_is_dict(obj))
	{
		for (i = 0; i < fz_dict_len(obj); i++)
		{
			fz_obj *key = fz_dict_get_key(obj, i);
			fz_obj *val = fz_dict_get_val(obj, i);
			if (fz_is_indirect(val))
			{
				int oldnum = fz_to_num(val);

				/* Dangling reference: nothing to map it to */
				if (oldnum < 0 || oldnum >= xref->len)
					continue;

				val = fz_new_indirect(renumbermap[oldnum], 0, xref);
				fz_dict_put(obj, key, val);
				fz_drop_obj(val);
			}
			else
			{
				renumberobj(val);
			}
		}
	}

	else if (fz_is_array(obj))
	{
		for (i = 0; i < fz_array_len(obj); i++)
		{
			fz_obj *val = fz_array_get(obj, i);
			if (fz_is_indirect(val))
			{
				int oldnum = fz_to_num(val);

				/* Dangling reference: nothing to map it to */
				if (oldnum < 0 || oldnum >= xref->len)
					continue;

				val = fz_new_indirect(renumbermap[oldnum], 0, xref);
				fz_array_put(obj, i, val);
				fz_drop_obj(val);
			}
			else
			{
				renumberobj(val);
			}
		}
	}
}
217
 
218
static void renumberobjs(void)
219
{
220
	pdf_xref_entry *oldxref;
221
	int newlen;
222
	int num;
223
 
224
	/* Apply renumber map to indirect references in all objects in xref */
225
	renumberobj(xref->trailer);
226
	for (num = 0; num < xref->len; num++)
227
	{
228
		fz_obj *obj = xref->table[num].obj;
229
 
230
		if (fz_is_indirect(obj))
231
		{
232
			obj = fz_new_indirect(renumbermap[fz_to_num(obj)], 0, xref);
233
			pdf_update_object(xref, num, 0, obj);
234
			fz_drop_obj(obj);
235
		}
236
		else
237
		{
238
			renumberobj(obj);
239
		}
240
	}
241
 
242
	/* Create new table for the reordered, compacted xref */
243
	oldxref = xref->table;
244
	xref->table = fz_calloc(xref->len, sizeof(pdf_xref_entry));
245
	xref->table[0] = oldxref[0];
246
 
247
	/* Move used objects into the new compacted xref */
248
	newlen = 0;
249
	for (num = 1; num < xref->len; num++)
250
	{
251
		if (uselist[num])
252
		{
253
			if (newlen < renumbermap[num])
254
				newlen = renumbermap[num];
255
			xref->table[renumbermap[num]] = oldxref[num];
256
		}
257
		else
258
		{
259
			if (oldxref[num].obj)
260
				fz_drop_obj(oldxref[num].obj);
261
		}
262
	}
263
 
264
	fz_free(oldxref);
265
 
266
	/* Update the used objects count in compacted xref */
267
	xref->len = newlen + 1;
268
 
269
	/* Update list of used objects to fit with compacted xref */
270
	for (num = 1; num < xref->len; num++)
271
		uselist[num] = 1;
272
}
273
 
274
/*
275
 * Recreate page tree to only retain specified pages.
276
 */
277
 
278
static void retainpages(int argc, char **argv)
279
{
280
	fz_error error;
281
	fz_obj *oldroot, *root, *pages, *kids, *countobj, *parent;
282
 
283
	/* Load the old page tree */
284
	error = pdf_load_page_tree(xref);
285
	if (error)
286
		die(fz_rethrow(error, "cannot load page tree"));
287
 
288
	/* Keep only pages/type entry to avoid references to unretained pages */
289
	oldroot = fz_dict_gets(xref->trailer, "Root");
290
	pages = fz_dict_gets(oldroot, "Pages");
291
 
292
	root = fz_new_dict(2);
293
	fz_dict_puts(root, "Type", fz_dict_gets(oldroot, "Type"));
294
	fz_dict_puts(root, "Pages", fz_dict_gets(oldroot, "Pages"));
295
 
296
	pdf_update_object(xref, fz_to_num(oldroot), fz_to_gen(oldroot), root);
297
 
298
	fz_drop_obj(root);
299
 
300
	/* Create a new kids array with only the pages we want to keep */
301
	parent = fz_new_indirect(fz_to_num(pages), fz_to_gen(pages), xref);
302
	kids = fz_new_array(1);
303
 
304
	/* Retain pages specified */
305
	while (argc - fz_optind)
306
	{
307
		int page, spage, epage;
308
		char *spec, *dash;
309
		char *pagelist = argv[fz_optind];
310
 
311
		spec = fz_strsep(&pagelist, ",");
312
		while (spec)
313
		{
314
			dash = strchr(spec, '-');
315
 
316
			if (dash == spec)
317
				spage = epage = pdf_count_pages(xref);
318
			else
319
				spage = epage = atoi(spec);
320
 
321
			if (dash)
322
			{
323
				if (strlen(dash) > 1)
324
					epage = atoi(dash + 1);
325
				else
326
					epage = pdf_count_pages(xref);
327
			}
328
 
329
			if (spage > epage)
330
				page = spage, spage = epage, epage = page;
331
 
332
			if (spage < 1)
333
				spage = 1;
334
			if (epage > pdf_count_pages(xref))
335
				epage = pdf_count_pages(xref);
336
 
337
			for (page = spage; page <= epage; page++)
338
			{
339
				fz_obj *pageobj = xref->page_objs[page-1];
340
				fz_obj *pageref = xref->page_refs[page-1];
341
 
342
				fz_dict_puts(pageobj, "Parent", parent);
343
 
344
				/* Store page object in new kids array */
345
				fz_array_push(kids, pageref);
346
			}
347
 
348
			spec = fz_strsep(&pagelist, ",");
349
		}
350
 
351
		fz_optind++;
352
	}
353
 
354
	fz_drop_obj(parent);
355
 
356
	/* Update page count and kids array */
357
	countobj = fz_new_int(fz_array_len(kids));
358
	fz_dict_puts(pages, "Count", countobj);
359
	fz_drop_obj(countobj);
360
	fz_dict_puts(pages, "Kids", kids);
361
	fz_drop_obj(kids);
362
}
363
 
364
/*
365
 * Make sure we have loaded objects from object streams.
366
 */
367
 
368
static void preloadobjstms(void)
369
{
370
	fz_error error;
371
	fz_obj *obj;
372
	int num;
373
 
374
	for (num = 0; num < xref->len; num++)
375
	{
376
		if (xref->table[num].type == 'o')
377
		{
378
			error = pdf_load_object(&obj, xref, num, 0);
379
			if (error)
380
				die(error);
381
			fz_drop_obj(obj);
382
		}
383
	}
384
}
385
 
386
/*
387
 * Save streams and objects to the output
388
 */
389
 
390
/*
 * Classify a byte value as binary (1) or text (0).
 *
 * Newline, carriage return and tab count as text. Every other control
 * character, including DEL (127), and every value above the ASCII
 * range counts as binary.
 */
static inline int isbinary(int c)
{
	if (c == '\n' || c == '\r' || c == '\t')
		return 0;
	/* DEL is a control character too, so the text range ends at 126 */
	return c < 32 || c >= 127;
}
396
 
397
/* Return 1 if any byte in the buffer is binary per isbinary(), else 0. */
static int isbinarystream(fz_buffer *buf)
{
	int pos = 0;

	while (pos < buf->len)
	{
		if (isbinary(buf->data[pos]))
			return 1;
		pos++;
	}

	return 0;
}
405
 
406
static fz_buffer *hexbuf(unsigned char *p, int n)
407
{
408
	static const char hex[16] = "0123456789abcdef";
409
	fz_buffer *buf;
410
	int x = 0;
411
 
412
	buf = fz_new_buffer(n * 2 + (n / 32) + 2);
413
 
414
	while (n--)
415
	{
416
		buf->data[buf->len++] = hex[*p >> 4];
417
		buf->data[buf->len++] = hex[*p & 15];
418
		if (++x == 32)
419
		{
420
			buf->data[buf->len++] = '\n';
421
			x = 0;
422
		}
423
		p++;
424
	}
425
 
426
	buf->data[buf->len++] = '>';
427
	buf->data[buf->len++] = '\n';
428
 
429
	return buf;
430
}
431
 
432
/*
 * Prepend ASCIIHexDecode to the filter chain of a stream dictionary.
 *
 *  - /Filter is a name: it becomes the array [ASCIIHexDecode, old];
 *    a dictionary /DecodeParms becomes [null, old] to stay aligned.
 *  - /Filter is an array: ASCIIHexDecode is inserted into it in place,
 *    and null is inserted into an array /DecodeParms.
 *  - Otherwise /Filter is set to the name ASCIIHexDecode.
 *
 * Only newly created objects are stored back into the dictionary.
 * Objects that the dictionary already holds are modified in place or
 * left alone rather than being put again under the same key.
 * NOTE(review): re-putting a value the dictionary already owns would be
 * unsafe if fz_dict_puts drops the old value before keeping the new one
 * - confirm against the fitz implementation.
 */
static void addhexfilter(fz_obj *dict)
{
	fz_obj *f, *dp, *newf, *newdp;
	fz_obj *ahx, *nullobj;

	ahx = fz_new_name("ASCIIHexDecode");
	nullobj = fz_new_null();
	newf = newdp = NULL;

	f = fz_dict_gets(dict, "Filter");
	dp = fz_dict_gets(dict, "DecodeParms");

	if (fz_is_name(f))
	{
		newf = fz_new_array(2);
		fz_array_push(newf, ahx);
		fz_array_push(newf, f);
		if (fz_is_dict(dp))
		{
			newdp = fz_new_array(2);
			fz_array_push(newdp, nullobj);
			fz_array_push(newdp, dp);
		}
	}
	else if (fz_is_array(f))
	{
		/* The arrays stay in the dictionary; just grow them */
		fz_array_insert(f, ahx);
		if (fz_is_array(dp))
			fz_array_insert(dp, nullobj);
	}
	else
	{
		fz_dict_puts(dict, "Filter", ahx);
	}

	/* Store the replacement arrays, which hold the old values */
	if (newf)
		fz_dict_puts(dict, "Filter", newf);
	if (newdp)
		fz_dict_puts(dict, "DecodeParms", newdp);

	fz_drop_obj(ahx);
	fz_drop_obj(nullobj);
	if (newf)
		fz_drop_obj(newf);
	if (newdp)
		fz_drop_obj(newdp);
}
478
 
479
/*
 * Write a stream object using the data returned by
 * pdf_load_raw_stream(). When -a is active and the data contains
 * binary bytes, the data is hex encoded, ASCIIHexDecode is added to
 * the filters and /Length is updated to the encoded size. A load
 * error is fatal.
 */
static void copystream(fz_obj *obj, int num, int gen)
{
	fz_buffer *data;
	fz_error error;

	error = pdf_load_raw_stream(&data, xref, num, gen);
	if (error)
		die(error);

	if (doascii && isbinarystream(data))
	{
		fz_buffer *hexed = hexbuf(data->data, data->len);
		fz_obj *lenobj;

		fz_drop_buffer(data);
		data = hexed;

		addhexfilter(obj);

		lenobj = fz_new_int(data->len);
		fz_dict_puts(obj, "Length", lenobj);
		fz_drop_obj(lenobj);
	}

	/* Object header, dictionary, then the stream payload */
	fprintf(out, "%d %d obj\n", num, gen);
	fz_fprint_obj(out, obj, !doexpand);
	fprintf(out, "stream\n");
	fwrite(data->data, 1, data->len, out);
	fprintf(out, "endstream\nendobj\n\n");

	fz_drop_buffer(data);
}
510
 
511
/*
 * Write a stream object using the data returned by pdf_load_stream().
 * The /Filter and /DecodeParms entries are removed from the
 * dictionary; when -a is active and the data contains binary bytes it
 * is hex encoded and ASCIIHexDecode is added. /Length is always set
 * to the size of the data written. A load error is fatal.
 */
static void expandstream(fz_obj *obj, int num, int gen)
{
	fz_buffer *data;
	fz_obj *lenobj;
	fz_error error;

	error = pdf_load_stream(&data, xref, num, gen);
	if (error)
		die(error);

	fz_dict_dels(obj, "Filter");
	fz_dict_dels(obj, "DecodeParms");

	if (doascii && isbinarystream(data))
	{
		fz_buffer *hexed = hexbuf(data->data, data->len);

		fz_drop_buffer(data);
		data = hexed;

		addhexfilter(obj);
	}

	lenobj = fz_new_int(data->len);
	fz_dict_puts(obj, "Length", lenobj);
	fz_drop_obj(lenobj);

	/* Object header, dictionary, then the stream payload */
	fprintf(out, "%d %d obj\n", num, gen);
	fz_fprint_obj(out, obj, !doexpand);
	fprintf(out, "stream\n");
	fwrite(data->data, 1, data->len, out);
	fprintf(out, "endstream\nendobj\n\n");

	fz_drop_buffer(data);
}
545
 
546
static void writeobject(int num, int gen)
547
{
548
	fz_error error;
549
	fz_obj *obj;
550
	fz_obj *type;
551
 
552
	error = pdf_load_object(&obj, xref, num, gen);
553
	if (error)
554
		die(error);
555
 
556
	/* skip ObjStm and XRef objects */
557
	if (fz_is_dict(obj))
558
	{
559
		type = fz_dict_gets(obj, "Type");
560
		if (fz_is_name(type) && !strcmp(fz_to_name(type), "ObjStm"))
561
		{
562
			uselist[num] = 0;
563
			fz_drop_obj(obj);
564
			return;
565
		}
566
		if (fz_is_name(type) && !strcmp(fz_to_name(type), "XRef"))
567
		{
568
			uselist[num] = 0;
569
			fz_drop_obj(obj);
570
			return;
571
		}
572
	}
573
 
574
	if (!pdf_is_stream(xref, num, gen))
575
	{
576
		fprintf(out, "%d %d obj\n", num, gen);
577
		fz_fprint_obj(out, obj, !doexpand);
578
		fprintf(out, "endobj\n\n");
579
	}
580
	else
581
	{
582
		if (doexpand && !pdf_is_jpx_image(obj))
583
			expandstream(obj, num, gen);
584
		else
585
			copystream(obj, num, gen);
586
	}
587
 
588
	fz_drop_obj(obj);
589
}
590
 
591
static void writexref(void)
592
{
593
	fz_obj *trailer;
594
	fz_obj *obj;
595
	int startxref;
596
	int num;
597
 
598
	startxref = ftell(out);
599
 
600
	fprintf(out, "xref\n0 %d\n", xref->len);
601
	for (num = 0; num < xref->len; num++)
602
	{
603
		if (uselist[num])
604
			fprintf(out, "%010d %05d n \n", ofslist[num], genlist[num]);
605
		else
606
			fprintf(out, "%010d %05d f \n", ofslist[num], genlist[num]);
607
	}
608
	fprintf(out, "\n");
609
 
610
	trailer = fz_new_dict(5);
611
 
612
	obj = fz_new_int(xref->len);
613
	fz_dict_puts(trailer, "Size", obj);
614
	fz_drop_obj(obj);
615
 
616
	obj = fz_dict_gets(xref->trailer, "Info");
617
	if (obj)
618
		fz_dict_puts(trailer, "Info", obj);
619
 
620
	obj = fz_dict_gets(xref->trailer, "Root");
621
	if (obj)
622
		fz_dict_puts(trailer, "Root", obj);
623
 
624
	obj = fz_dict_gets(xref->trailer, "ID");
625
	if (obj)
626
		fz_dict_puts(trailer, "ID", obj);
627
 
628
	fprintf(out, "trailer\n");
629
	fz_fprint_obj(out, trailer, !doexpand);
630
	fprintf(out, "\n");
631
 
632
	fz_drop_obj(trailer);
633
 
634
	fprintf(out, "startxref\n%d\n%%%%EOF\n", startxref);
635
}
636
 
637
static void writepdf(void)
638
{
639
	int lastfree;
640
	int num;
641
 
642
	for (num = 0; num < xref->len; num++)
643
	{
644
		if (xref->table[num].type == 'f')
645
			genlist[num] = xref->table[num].gen;
646
		if (xref->table[num].type == 'n')
647
			genlist[num] = xref->table[num].gen;
648
		if (xref->table[num].type == 'o')
649
			genlist[num] = 0;
650
 
651
		if (dogarbage && !uselist[num])
652
			continue;
653
 
654
		if (xref->table[num].type == 'n' || xref->table[num].type == 'o')
655
		{
656
			uselist[num] = 1;
657
			ofslist[num] = ftell(out);
658
			writeobject(num, genlist[num]);
659
		}
660
	}
661
 
662
	/* Construct linked list of free object slots */
663
	lastfree = 0;
664
	for (num = 0; num < xref->len; num++)
665
	{
666
		if (!uselist[num])
667
		{
668
			genlist[num]++;
669
			ofslist[lastfree] = num;
670
			lastfree = num;
671
		}
672
	}
673
 
674
	writexref();
675
}
676
 
677
int main(int argc, char **argv)
678
{
679
	fz_error error;
680
	char *infile;
681
	char *outfile = "out.pdf";
682
	char *password = "";
683
	int c, num;
684
	int subset;
685
 
686
	while ((c = fz_getopt(argc, argv, "adgp:")) != -1)
687
	{
688
		switch (c)
689
		{
690
		case 'p': password = fz_optarg; break;
691
		case 'g': dogarbage ++; break;
692
		case 'd': doexpand ++; break;
693
		case 'a': doascii ++; break;
694
		default: usage(); break;
695
		}
696
	}
697
 
698
	if (argc - fz_optind < 1)
699
		usage();
700
 
701
	infile = argv[fz_optind++];
702
 
703
	if (argc - fz_optind > 0 &&
704
		(strstr(argv[fz_optind], ".pdf") || strstr(argv[fz_optind], ".PDF")))
705
	{
706
		outfile = argv[fz_optind++];
707
	}
708
 
709
	subset = 0;
710
	if (argc - fz_optind > 0)
711
		subset = 1;
712
 
713
	error = pdf_open_xref(&xref, infile, password);
714
	if (error)
715
		die(fz_rethrow(error, "cannot open input file '%s'", infile));
716
 
717
	out = fopen(outfile, "wb");
718
	if (!out)
719
		die(fz_throw("cannot open output file '%s'", outfile));
720
 
721
	fprintf(out, "%%PDF-%d.%d\n", xref->version / 10, xref->version % 10);
722
	fprintf(out, "%%\316\274\341\277\246\n\n");
723
 
724
	uselist = fz_calloc(xref->len + 1, sizeof(char));
725
	ofslist = fz_calloc(xref->len + 1, sizeof(int));
726
	genlist = fz_calloc(xref->len + 1, sizeof(int));
727
	renumbermap = fz_calloc(xref->len + 1, sizeof(int));
728
 
729
	for (num = 0; num < xref->len; num++)
730
	{
731
		uselist[num] = 0;
732
		ofslist[num] = 0;
733
		genlist[num] = 0;
734
		renumbermap[num] = num;
735
	}
736
 
737
	/* Make sure any objects hidden in compressed streams have been loaded */
738
	preloadobjstms();
739
 
740
	/* Only retain the specified subset of the pages */
741
	if (subset)
742
		retainpages(argc, argv);
743
 
744
	/* Sweep & mark objects from the trailer */
745
	if (dogarbage >= 1)
746
		sweepobj(xref->trailer);
747
 
748
	/* Coalesce and renumber duplicate objects */
749
	if (dogarbage >= 3)
750
		removeduplicateobjs();
751
 
752
	/* Compact xref by renumbering and removing unused objects */
753
	if (dogarbage >= 2)
754
		compactxref();
755
 
756
	/* Make renumbering affect all indirect references and update xref */
757
	if (dogarbage >= 2)
758
		renumberobjs();
759
 
760
	writepdf();
761
 
762
	if (fclose(out))
763
		die(fz_throw("cannot close output file '%s'", outfile));
764
 
765
	fz_free(uselist);
766
	fz_free(ofslist);
767
	fz_free(genlist);
768
	fz_free(renumbermap);
769
 
770
	pdf_free_xref(xref);
771
 
772
	fz_flush_warnings();
773
 
774
	return 0;
775
}