0,0 → 1,775 |
/* |
* PDF cleaning tool: general purpose pdf syntax washer. |
* |
* Rewrite PDF with pretty printed objects. |
* Garbage collect unreachable objects. |
* Inflate compressed streams. |
* Create subset documents. |
* |
* TODO: linearize document for fast web view |
*/ |
|
#include "fitz.h" |
#include "mupdf.h" |
|
static FILE *out = NULL; |
|
static char *uselist = NULL; |
static int *ofslist = NULL; |
static int *genlist = NULL; |
static int *renumbermap = NULL; |
|
static int dogarbage = 0; |
static int doexpand = 0; |
static int doascii = 0; |
|
static pdf_xref *xref = NULL; |
|
void die(fz_error error) |
{ |
fz_catch(error, "aborting"); |
if (xref) |
pdf_free_xref(xref); |
exit(1); |
} |
|
/*
 * Print the command line synopsis to stderr and exit with status 1.
 */
static void usage(void)
{
	static const char text[] =
		"usage: pdfclean [options] input.pdf [output.pdf] [pages]\n"
		"\t-p -\tpassword\n"
		"\t-g\tgarbage collect unused objects\n"
		"\t-gg\tin addition to -g compact xref table\n"
		"\t-ggg\tin addition to -gg merge duplicate objects\n"
		"\t-d\tdecompress streams\n"
		"\t-a\tascii hex encode binary streams\n"
		"\tpages\tcomma separated list of ranges\n";

	fputs(text, stderr);
	exit(1);
}
|
/* |
* Garbage collect objects not reachable from the trailer. |
*/ |
|
static void sweepref(fz_obj *ref); |
|
/*
 * Walk an object and mark everything reachable from it.
 *
 * Indirect references are handed to sweepref(); dictionaries and arrays
 * are traversed element by element. Any other object is ignored.
 */
static void sweepobj(fz_obj *obj)
{
	int idx;

	if (fz_is_indirect(obj))
	{
		sweepref(obj);
		return;
	}

	if (fz_is_dict(obj))
	{
		for (idx = 0; idx < fz_dict_len(obj); idx++)
			sweepobj(fz_dict_get_val(obj, idx));
		return;
	}

	if (fz_is_array(obj))
	{
		for (idx = 0; idx < fz_array_len(obj); idx++)
			sweepobj(fz_array_get(obj, idx));
	}
}
|
/*
 * Mark the object that the indirect reference `obj` points at as used
 * and sweep its contents.
 *
 * References whose object number lies outside the xref table are
 * ignored. Objects that are already marked are not visited again, which
 * also stops the recursion on reference cycles.
 *
 * For stream objects an indirect /Length is replaced by the resolved
 * value, and the object that held the length is marked unused.
 */
static void sweepref(fz_obj *obj)
{
	int num = fz_to_num(obj);
	int gen = fz_to_gen(obj);

	if (num < 0 || num >= xref->len)
		return;
	if (uselist[num])
		return;

	uselist[num] = 1;

	/* Bake in /Length in stream objects */
	if (pdf_is_stream(xref, num, gen))
	{
		fz_obj *len = fz_dict_gets(obj, "Length");
		if (fz_is_indirect(len))
		{
			int lennum = fz_to_num(len);

			/*
			 * The object number comes straight from the file; only
			 * touch uselist when it addresses a real xref slot.
			 */
			if (lennum >= 0 && lennum < xref->len)
				uselist[lennum] = 0;

			len = fz_resolve_indirect(len);
			fz_dict_puts(obj, "Length", len);
		}
	}

	sweepobj(fz_resolve_indirect(obj));
}
|
/* |
* Scan for and remove duplicate objects (slow) |
*/ |
|
static void removeduplicateobjs(void) |
{ |
int num, other; |
|
for (num = 1; num < xref->len; num++) |
{ |
/* Only compare an object to objects preceeding it */ |
for (other = 1; other < num; other++) |
{ |
fz_obj *a, *b; |
|
if (num == other || !uselist[num] || !uselist[other]) |
continue; |
|
/* |
* Comparing stream objects data contents would take too long. |
* |
* pdf_is_stream calls pdf_cache_object and ensures |
* that the xref table has the objects loaded. |
*/ |
if (pdf_is_stream(xref, num, 0) || pdf_is_stream(xref, other, 0)) |
continue; |
|
a = xref->table[num].obj; |
b = xref->table[other].obj; |
|
a = fz_resolve_indirect(a); |
b = fz_resolve_indirect(b); |
|
if (fz_objcmp(a, b)) |
continue; |
|
/* Keep the lowest numbered object */ |
renumbermap[num] = MIN(num, other); |
renumbermap[other] = MIN(num, other); |
uselist[MAX(num, other)] = 0; |
|
/* One duplicate was found, do not look for another */ |
break; |
} |
} |
} |
|
/* |
* Renumber objects sequentially so the xref is more compact |
*/ |
|
static void compactxref(void) |
{ |
int num, newnum; |
|
/* |
* Update renumbermap in-place, clustering all used |
* objects together at low object ids. Objects that |
* already should be renumbered will have their new |
* object ids be updated to reflect the compaction. |
*/ |
|
newnum = 1; |
for (num = 1; num < xref->len; num++) |
{ |
if (uselist[num] && renumbermap[num] == num) |
renumbermap[num] = newnum++; |
else if (renumbermap[num] != num) |
renumbermap[num] = renumbermap[renumbermap[num]]; |
} |
} |
|
/* |
* Update indirect objects according to renumbering established when |
* removing duplicate objects and compacting the xref. |
*/ |
|
/*
 * Create the renumbered replacement for the indirect reference `ref`.
 *
 * Returns a new reference (generation 0) that the caller must drop, or
 * NULL when the referenced object number lies outside the xref table.
 * Such dangling references have no renumbermap entry and must be left
 * as they are instead of indexing past the end of the map.
 */
static fz_obj *renumberref(fz_obj *ref)
{
	int num = fz_to_num(ref);

	if (num < 0 || num >= xref->len)
		return NULL;

	return fz_new_indirect(renumbermap[num], 0, xref);
}

/*
 * Apply renumbermap to every indirect reference stored in `obj`.
 *
 * Dictionaries and arrays are processed recursively; references found
 * as direct members are replaced in place. Other object types contain
 * no references and are ignored. References to object numbers outside
 * the xref table are kept unchanged.
 */
static void renumberobj(fz_obj *obj)
{
	int i;

	if (fz_is_dict(obj))
	{
		for (i = 0; i < fz_dict_len(obj); i++)
		{
			fz_obj *key = fz_dict_get_key(obj, i);
			fz_obj *val = fz_dict_get_val(obj, i);
			if (fz_is_indirect(val))
			{
				val = renumberref(val);
				if (val)
				{
					fz_dict_put(obj, key, val);
					fz_drop_obj(val);
				}
			}
			else
			{
				renumberobj(val);
			}
		}
	}

	else if (fz_is_array(obj))
	{
		for (i = 0; i < fz_array_len(obj); i++)
		{
			fz_obj *val = fz_array_get(obj, i);
			if (fz_is_indirect(val))
			{
				val = renumberref(val);
				if (val)
				{
					fz_array_put(obj, i, val);
					fz_drop_obj(val);
				}
			}
			else
			{
				renumberobj(val);
			}
		}
	}
}
|
static void renumberobjs(void) |
{ |
pdf_xref_entry *oldxref; |
int newlen; |
int num; |
|
/* Apply renumber map to indirect references in all objects in xref */ |
renumberobj(xref->trailer); |
for (num = 0; num < xref->len; num++) |
{ |
fz_obj *obj = xref->table[num].obj; |
|
if (fz_is_indirect(obj)) |
{ |
obj = fz_new_indirect(renumbermap[fz_to_num(obj)], 0, xref); |
pdf_update_object(xref, num, 0, obj); |
fz_drop_obj(obj); |
} |
else |
{ |
renumberobj(obj); |
} |
} |
|
/* Create new table for the reordered, compacted xref */ |
oldxref = xref->table; |
xref->table = fz_calloc(xref->len, sizeof(pdf_xref_entry)); |
xref->table[0] = oldxref[0]; |
|
/* Move used objects into the new compacted xref */ |
newlen = 0; |
for (num = 1; num < xref->len; num++) |
{ |
if (uselist[num]) |
{ |
if (newlen < renumbermap[num]) |
newlen = renumbermap[num]; |
xref->table[renumbermap[num]] = oldxref[num]; |
} |
else |
{ |
if (oldxref[num].obj) |
fz_drop_obj(oldxref[num].obj); |
} |
} |
|
fz_free(oldxref); |
|
/* Update the used objects count in compacted xref */ |
xref->len = newlen + 1; |
|
/* Update list of used objects to fit with compacted xref */ |
for (num = 1; num < xref->len; num++) |
uselist[num] = 1; |
} |
|
/* |
* Recreate page tree to only retain specified pages. |
*/ |
|
static void retainpages(int argc, char **argv) |
{ |
fz_error error; |
fz_obj *oldroot, *root, *pages, *kids, *countobj, *parent; |
|
/* Load the old page tree */ |
error = pdf_load_page_tree(xref); |
if (error) |
die(fz_rethrow(error, "cannot load page tree")); |
|
/* Keep only pages/type entry to avoid references to unretained pages */ |
oldroot = fz_dict_gets(xref->trailer, "Root"); |
pages = fz_dict_gets(oldroot, "Pages"); |
|
root = fz_new_dict(2); |
fz_dict_puts(root, "Type", fz_dict_gets(oldroot, "Type")); |
fz_dict_puts(root, "Pages", fz_dict_gets(oldroot, "Pages")); |
|
pdf_update_object(xref, fz_to_num(oldroot), fz_to_gen(oldroot), root); |
|
fz_drop_obj(root); |
|
/* Create a new kids array with only the pages we want to keep */ |
parent = fz_new_indirect(fz_to_num(pages), fz_to_gen(pages), xref); |
kids = fz_new_array(1); |
|
/* Retain pages specified */ |
while (argc - fz_optind) |
{ |
int page, spage, epage; |
char *spec, *dash; |
char *pagelist = argv[fz_optind]; |
|
spec = fz_strsep(&pagelist, ","); |
while (spec) |
{ |
dash = strchr(spec, '-'); |
|
if (dash == spec) |
spage = epage = pdf_count_pages(xref); |
else |
spage = epage = atoi(spec); |
|
if (dash) |
{ |
if (strlen(dash) > 1) |
epage = atoi(dash + 1); |
else |
epage = pdf_count_pages(xref); |
} |
|
if (spage > epage) |
page = spage, spage = epage, epage = page; |
|
if (spage < 1) |
spage = 1; |
if (epage > pdf_count_pages(xref)) |
epage = pdf_count_pages(xref); |
|
for (page = spage; page <= epage; page++) |
{ |
fz_obj *pageobj = xref->page_objs[page-1]; |
fz_obj *pageref = xref->page_refs[page-1]; |
|
fz_dict_puts(pageobj, "Parent", parent); |
|
/* Store page object in new kids array */ |
fz_array_push(kids, pageref); |
} |
|
spec = fz_strsep(&pagelist, ","); |
} |
|
fz_optind++; |
} |
|
fz_drop_obj(parent); |
|
/* Update page count and kids array */ |
countobj = fz_new_int(fz_array_len(kids)); |
fz_dict_puts(pages, "Count", countobj); |
fz_drop_obj(countobj); |
fz_dict_puts(pages, "Kids", kids); |
fz_drop_obj(kids); |
} |
|
/* |
* Make sure we have loaded objects from object streams. |
*/ |
|
static void preloadobjstms(void) |
{ |
fz_error error; |
fz_obj *obj; |
int num; |
|
for (num = 0; num < xref->len; num++) |
{ |
if (xref->table[num].type == 'o') |
{ |
error = pdf_load_object(&obj, xref, num, 0); |
if (error) |
die(error); |
fz_drop_obj(obj); |
} |
} |
} |
|
/* |
* Save streams and objects to the output |
*/ |
|
/*
 * Classify one byte value.
 *
 * Returns 0 for newline, carriage return, tab and every value in the
 * range 32..127; returns 1 for anything else.
 */
static inline int isbinary(int c)
{
	switch (c)
	{
	case '\n':
	case '\r':
	case '\t':
		return 0;
	default:
		return !(c >= 32 && c <= 127);
	}
}
|
/*
 * Return 1 if any of the first buf->len bytes of the buffer is
 * classified as binary by isbinary(), otherwise 0.
 */
static int isbinarystream(fz_buffer *buf)
{
	int pos = 0;

	while (pos < buf->len)
	{
		if (isbinary(buf->data[pos]))
			return 1;
		pos++;
	}

	return 0;
}
|
static fz_buffer *hexbuf(unsigned char *p, int n) |
{ |
static const char hex[16] = "0123456789abcdef"; |
fz_buffer *buf; |
int x = 0; |
|
buf = fz_new_buffer(n * 2 + (n / 32) + 2); |
|
while (n--) |
{ |
buf->data[buf->len++] = hex[*p >> 4]; |
buf->data[buf->len++] = hex[*p & 15]; |
if (++x == 32) |
{ |
buf->data[buf->len++] = '\n'; |
x = 0; |
} |
p++; |
} |
|
buf->data[buf->len++] = '>'; |
buf->data[buf->len++] = '\n'; |
|
return buf; |
} |
|
/*
 * Put ASCIIHexDecode in front of the filters of a stream dictionary.
 *
 * - Filter is a name: it becomes the array [ASCIIHexDecode, name]; a
 *   dictionary DecodeParms becomes [null, dict].
 * - Filter is an array: ASCIIHexDecode is inserted into it, and a null
 *   is inserted into an array DecodeParms.
 * - Otherwise Filter is set to the name ASCIIHexDecode.
 *
 * DecodeParms is only stored back when the dictionary had one.
 */
static void addhexfilter(fz_obj *dict)
{
	fz_obj *filter = fz_dict_gets(dict, "Filter");
	fz_obj *parms = fz_dict_gets(dict, "DecodeParms");
	fz_obj *hexname = fz_new_name("ASCIIHexDecode");
	fz_obj *nullobj = fz_new_null();
	fz_obj *filterarr = NULL;
	fz_obj *parmsarr = NULL;

	if (fz_is_array(filter))
	{
		fz_array_insert(filter, hexname);
		if (fz_is_array(parms))
			fz_array_insert(parms, nullobj);
	}
	else if (fz_is_name(filter))
	{
		filterarr = fz_new_array(2);
		fz_array_push(filterarr, hexname);
		fz_array_push(filterarr, filter);
		filter = filterarr;

		if (fz_is_dict(parms))
		{
			parmsarr = fz_new_array(2);
			fz_array_push(parmsarr, nullobj);
			fz_array_push(parmsarr, parms);
			parms = parmsarr;
		}
	}
	else
	{
		filter = hexname;
	}

	fz_dict_puts(dict, "Filter", filter);
	if (parms)
		fz_dict_puts(dict, "DecodeParms", parms);

	/* Release the references created in this function */
	fz_drop_obj(hexname);
	fz_drop_obj(nullobj);
	if (filterarr)
		fz_drop_obj(filterarr);
	if (parmsarr)
		fz_drop_obj(parmsarr);
}
|
/*
 * Write object num/gen as a stream object, using the stream data as
 * returned by pdf_load_raw_stream.
 *
 * When -a is active and the data contains binary bytes, the data is hex
 * encoded, ASCIIHexDecode is added to the filters and Length is updated
 * to the encoded size. Load errors are fatal.
 */
static void copystream(fz_obj *obj, int num, int gen)
{
	fz_error error;
	fz_buffer *buf;

	error = pdf_load_raw_stream(&buf, xref, num, gen);
	if (error)
		die(error);

	if (doascii && isbinarystream(buf))
	{
		fz_buffer *encoded = hexbuf(buf->data, buf->len);
		fz_obj *newlen;

		fz_drop_buffer(buf);
		buf = encoded;

		addhexfilter(obj);

		newlen = fz_new_int(buf->len);
		fz_dict_puts(obj, "Length", newlen);
		fz_drop_obj(newlen);
	}

	/* Object header, dictionary, stream data, trailer */
	fprintf(out, "%d %d obj\n", num, gen);
	fz_fprint_obj(out, obj, !doexpand);
	fprintf(out, "stream\n");
	fwrite(buf->data, 1, buf->len, out);
	fprintf(out, "endstream\nendobj\n\n");

	fz_drop_buffer(buf);
}
|
/*
 * Write object num/gen as a stream object, using the stream data as
 * returned by pdf_load_stream.
 *
 * Filter and DecodeParms are removed from the dictionary. When -a is
 * active and the data contains binary bytes, the data is hex encoded
 * and ASCIIHexDecode becomes the filter. Length is always rewritten to
 * the size of the data that is emitted. Load errors are fatal.
 */
static void expandstream(fz_obj *obj, int num, int gen)
{
	fz_error error;
	fz_buffer *buf;
	fz_obj *newlen;

	error = pdf_load_stream(&buf, xref, num, gen);
	if (error)
		die(error);

	fz_dict_dels(obj, "Filter");
	fz_dict_dels(obj, "DecodeParms");

	if (doascii && isbinarystream(buf))
	{
		fz_buffer *encoded = hexbuf(buf->data, buf->len);

		fz_drop_buffer(buf);
		buf = encoded;

		addhexfilter(obj);
	}

	newlen = fz_new_int(buf->len);
	fz_dict_puts(obj, "Length", newlen);
	fz_drop_obj(newlen);

	/* Object header, dictionary, stream data, trailer */
	fprintf(out, "%d %d obj\n", num, gen);
	fz_fprint_obj(out, obj, !doexpand);
	fprintf(out, "stream\n");
	fwrite(buf->data, 1, buf->len, out);
	fprintf(out, "endstream\nendobj\n\n");

	fz_drop_buffer(buf);
}
|
static void writeobject(int num, int gen) |
{ |
fz_error error; |
fz_obj *obj; |
fz_obj *type; |
|
error = pdf_load_object(&obj, xref, num, gen); |
if (error) |
die(error); |
|
/* skip ObjStm and XRef objects */ |
if (fz_is_dict(obj)) |
{ |
type = fz_dict_gets(obj, "Type"); |
if (fz_is_name(type) && !strcmp(fz_to_name(type), "ObjStm")) |
{ |
uselist[num] = 0; |
fz_drop_obj(obj); |
return; |
} |
if (fz_is_name(type) && !strcmp(fz_to_name(type), "XRef")) |
{ |
uselist[num] = 0; |
fz_drop_obj(obj); |
return; |
} |
} |
|
if (!pdf_is_stream(xref, num, gen)) |
{ |
fprintf(out, "%d %d obj\n", num, gen); |
fz_fprint_obj(out, obj, !doexpand); |
fprintf(out, "endobj\n\n"); |
} |
else |
{ |
if (doexpand && !pdf_is_jpx_image(obj)) |
expandstream(obj, num, gen); |
else |
copystream(obj, num, gen); |
} |
|
fz_drop_obj(obj); |
} |
|
static void writexref(void) |
{ |
fz_obj *trailer; |
fz_obj *obj; |
int startxref; |
int num; |
|
startxref = ftell(out); |
|
fprintf(out, "xref\n0 %d\n", xref->len); |
for (num = 0; num < xref->len; num++) |
{ |
if (uselist[num]) |
fprintf(out, "%010d %05d n \n", ofslist[num], genlist[num]); |
else |
fprintf(out, "%010d %05d f \n", ofslist[num], genlist[num]); |
} |
fprintf(out, "\n"); |
|
trailer = fz_new_dict(5); |
|
obj = fz_new_int(xref->len); |
fz_dict_puts(trailer, "Size", obj); |
fz_drop_obj(obj); |
|
obj = fz_dict_gets(xref->trailer, "Info"); |
if (obj) |
fz_dict_puts(trailer, "Info", obj); |
|
obj = fz_dict_gets(xref->trailer, "Root"); |
if (obj) |
fz_dict_puts(trailer, "Root", obj); |
|
obj = fz_dict_gets(xref->trailer, "ID"); |
if (obj) |
fz_dict_puts(trailer, "ID", obj); |
|
fprintf(out, "trailer\n"); |
fz_fprint_obj(out, trailer, !doexpand); |
fprintf(out, "\n"); |
|
fz_drop_obj(trailer); |
|
fprintf(out, "startxref\n%d\n%%%%EOF\n", startxref); |
} |
|
static void writepdf(void) |
{ |
int lastfree; |
int num; |
|
for (num = 0; num < xref->len; num++) |
{ |
if (xref->table[num].type == 'f') |
genlist[num] = xref->table[num].gen; |
if (xref->table[num].type == 'n') |
genlist[num] = xref->table[num].gen; |
if (xref->table[num].type == 'o') |
genlist[num] = 0; |
|
if (dogarbage && !uselist[num]) |
continue; |
|
if (xref->table[num].type == 'n' || xref->table[num].type == 'o') |
{ |
uselist[num] = 1; |
ofslist[num] = ftell(out); |
writeobject(num, genlist[num]); |
} |
} |
|
/* Construct linked list of free object slots */ |
lastfree = 0; |
for (num = 0; num < xref->len; num++) |
{ |
if (!uselist[num]) |
{ |
genlist[num]++; |
ofslist[lastfree] = num; |
lastfree = num; |
} |
} |
|
writexref(); |
} |
|
int main(int argc, char **argv) |
{ |
fz_error error; |
char *infile; |
char *outfile = "out.pdf"; |
char *password = ""; |
int c, num; |
int subset; |
|
while ((c = fz_getopt(argc, argv, "adgp:")) != -1) |
{ |
switch (c) |
{ |
case 'p': password = fz_optarg; break; |
case 'g': dogarbage ++; break; |
case 'd': doexpand ++; break; |
case 'a': doascii ++; break; |
default: usage(); break; |
} |
} |
|
if (argc - fz_optind < 1) |
usage(); |
|
infile = argv[fz_optind++]; |
|
if (argc - fz_optind > 0 && |
(strstr(argv[fz_optind], ".pdf") || strstr(argv[fz_optind], ".PDF"))) |
{ |
outfile = argv[fz_optind++]; |
} |
|
subset = 0; |
if (argc - fz_optind > 0) |
subset = 1; |
|
error = pdf_open_xref(&xref, infile, password); |
if (error) |
die(fz_rethrow(error, "cannot open input file '%s'", infile)); |
|
out = fopen(outfile, "wb"); |
if (!out) |
die(fz_throw("cannot open output file '%s'", outfile)); |
|
fprintf(out, "%%PDF-%d.%d\n", xref->version / 10, xref->version % 10); |
fprintf(out, "%%\316\274\341\277\246\n\n"); |
|
uselist = fz_calloc(xref->len + 1, sizeof(char)); |
ofslist = fz_calloc(xref->len + 1, sizeof(int)); |
genlist = fz_calloc(xref->len + 1, sizeof(int)); |
renumbermap = fz_calloc(xref->len + 1, sizeof(int)); |
|
for (num = 0; num < xref->len; num++) |
{ |
uselist[num] = 0; |
ofslist[num] = 0; |
genlist[num] = 0; |
renumbermap[num] = num; |
} |
|
/* Make sure any objects hidden in compressed streams have been loaded */ |
preloadobjstms(); |
|
/* Only retain the specified subset of the pages */ |
if (subset) |
retainpages(argc, argv); |
|
/* Sweep & mark objects from the trailer */ |
if (dogarbage >= 1) |
sweepobj(xref->trailer); |
|
/* Coalesce and renumber duplicate objects */ |
if (dogarbage >= 3) |
removeduplicateobjs(); |
|
/* Compact xref by renumbering and removing unused objects */ |
if (dogarbage >= 2) |
compactxref(); |
|
/* Make renumbering affect all indirect references and update xref */ |
if (dogarbage >= 2) |
renumberobjs(); |
|
writepdf(); |
|
if (fclose(out)) |
die(fz_throw("cannot close output file '%s'", outfile)); |
|
fz_free(uselist); |
fz_free(ofslist); |
fz_free(genlist); |
fz_free(renumbermap); |
|
pdf_free_xref(xref); |
|
fz_flush_warnings(); |
|
return 0; |
} |