WebSVN – Kolibri OS – Path Comparison – / – /contrib/media/updf/apps/pdfclean.c Rev 4679 and /contrib/media/updf/apps/pdfclean.c Rev 4680

Regard whitespace Rev 4679 → Rev 4680

 /contrib/media/updf/apps/pdfclean.c
 ,0 → 1,775
+/*
+ * PDF cleaning tool: general purpose pdf syntax washer.
+ *
+ * Rewrite PDF with pretty printed objects.
+ * Garbage collect unreachable objects.
+ * Inflate compressed streams.
+ * Create subset documents.
+ *
+ * TODO: linearize document for fast web view
+ */
+#include "fitz.h"
+#include "mupdf.h"
+static FILE *out = NULL; /* output stream; opened in main() and written by the write* helpers */
+static char *uselist = NULL; /* per object number: non-zero if the object is kept in the output */
+static int *ofslist = NULL; /* per object number: output file offset, or next free slot for unused entries */
+static int *genlist = NULL; /* per object number: generation number written to the xref section */
+static int *renumbermap = NULL; /* per object number: object number to use after deduplication/compaction */
+static int dogarbage = 0; /* number of -g options given (garbage collection level) */
+static int doexpand = 0; /* non-zero when -d was given: decompress streams */
+static int doascii = 0; /* non-zero when -a was given: ASCII hex encode binary streams */
+static pdf_xref *xref = NULL; /* input document; opened in main(), freed in die() or at the end of main() */
+/*
+ * Fatal error exit: hand the error to fz_catch, release the input
+ * document if one has been opened, and terminate with exit status 1.
+ */
+void die(fz_error error)
+{
+        fz_catch(error, "aborting");
+        if (xref != NULL)
+                pdf_free_xref(xref);
+        exit(1);
+}
+/*
+ * Print the command line synopsis to stderr and exit with status 1.
+ */
+static void usage(void)
+{
+        static const char helptext[] =
+                "usage: pdfclean [options] input.pdf [output.pdf] [pages]\n"
+                "\t-p -\tpassword\n"
+                "\t-g\tgarbage collect unused objects\n"
+                "\t-gg\tin addition to -g compact xref table\n"
+                "\t-ggg\tin addition to -gg merge duplicate objects\n"
+                "\t-d\tdecompress streams\n"
+                "\t-a\tascii hex encode binary streams\n"
+                "\tpages\tcomma separated list of ranges\n";
+        /* The text contains no conversion specifiers, so fputs emits it verbatim */
+        fputs(helptext, stderr);
+        exit(1);
+}
+/*
+ * Garbage collection, mark phase: flag in uselist every object that
+ * can be reached from the object the sweep starts at (main() starts
+ * it at the trailer).
+ */
+static void sweepref(fz_obj *ref);
+/*
+ * Walk one object: an indirect reference is handed to sweepref, a
+ * dictionary has each of its values walked, an array has each of its
+ * elements walked. Any other kind of object is left alone.
+ */
+static void sweepobj(fz_obj *obj)
+{
+        int idx;
+        if (fz_is_indirect(obj))
+        {
+                sweepref(obj);
+                return;
+        }
+        if (fz_is_dict(obj))
+        {
+                idx = 0;
+                while (idx < fz_dict_len(obj))
+                {
+                        sweepobj(fz_dict_get_val(obj, idx));
+                        idx++;
+                }
+                return;
+        }
+        if (fz_is_array(obj))
+        {
+                idx = 0;
+                while (idx < fz_array_len(obj))
+                {
+                        sweepobj(fz_array_get(obj, idx));
+                        idx++;
+                }
+        }
+}
+/*
+ * Mark the object that an indirect reference points at as used and
+ * continue the sweep through its contents.
+ *
+ * References whose object number lies outside the xref table are
+ * ignored, and an object that is already marked is not visited again
+ * (this also stops the recursion on reference cycles).
+ *
+ * For stream objects an indirect /Length is replaced by the resolved
+ * value, and the object that held the length is unmarked again.
+ */
+static void sweepref(fz_obj *obj)
+{
+        int num = fz_to_num(obj);
+        int gen = fz_to_gen(obj);
+        if (num < 0 || num >= xref->len)
+                return;
+        if (uselist[num])
+                return;
+        uselist[num] = 1;
+        /* Bake in /Length in stream objects */
+        if (pdf_is_stream(xref, num, gen))
+        {
+                fz_obj *len = fz_dict_gets(obj, "Length");
+                if (fz_is_indirect(len))
+                {
+                        int lennum = fz_to_num(len);
+                        /*
+                         * The length reference comes straight from the input
+                         * file; only touch uselist when its object number is
+                         * inside the table, otherwise a malformed file would
+                         * make us write outside the array.
+                         */
+                        if (lennum >= 0 && lennum < xref->len)
+                                uselist[lennum] = 0;
+                        len = fz_resolve_indirect(len);
+                        fz_dict_puts(obj, "Length", len);
+                }
+        }
+        sweepobj(fz_resolve_indirect(obj));
+}
+/*
+ * Scan for and remove duplicate objects (slow: every used object is
+ * compared against the used objects with a lower number).
+ *
+ * When two non-stream objects compare equal, the higher numbered one
+ * is unmarked in uselist and renumbermap redirects it to the lower
+ * numbered one.
+ */
+static void removeduplicateobjs(void)
+{
+        int num, other;
+        for (num = 1; num < xref->len; num++)
+        {
+                /* Only compare an object to objects preceding it, so other < num */
+                for (other = 1; other < num; other++)
+                {
+                        fz_obj *cur, *prev;
+                        if (!uselist[num] || !uselist[other])
+                                continue;
+                        /*
+                         * Stream objects are skipped: comparing their data
+                         * contents would take too long.
+                         *
+                         * pdf_is_stream calls pdf_cache_object and ensures
+                         * that the xref table has the objects loaded.
+                         */
+                        if (pdf_is_stream(xref, num, 0))
+                                continue;
+                        if (pdf_is_stream(xref, other, 0))
+                                continue;
+                        cur = xref->table[num].obj;
+                        prev = xref->table[other].obj;
+                        cur = fz_resolve_indirect(cur);
+                        prev = fz_resolve_indirect(prev);
+                        if (fz_objcmp(cur, prev) != 0)
+                                continue;
+                        /* Keep the lowest numbered object, which is always 'other' here */
+                        renumbermap[num] = other;
+                        renumbermap[other] = other;
+                        uselist[num] = 0;
+                        /* One duplicate was found, do not look for another */
+                        break;
+                }
+        }
+}
+/*
+ * Renumber objects sequentially so the xref is more compact.
+ *
+ * renumbermap is rewritten in place in a single ascending pass:
+ *  - an entry that already points at a different object follows that
+ *    object's (already rewritten) entry;
+ *  - a used object that still maps to itself gets the next free low
+ *    object number, starting from 1;
+ *  - an unused object that maps to itself is left unchanged.
+ */
+static void compactxref(void)
+{
+        int num;
+        int next = 1;
+        for (num = 1; num < xref->len; num++)
+        {
+                int target = renumbermap[num];
+                if (target != num)
+                        renumbermap[num] = renumbermap[target];
+                else if (uselist[num])
+                        renumbermap[num] = next++;
+        }
+}
+/*
+ * Update indirect objects according to renumbering established when
+ * removing duplicate objects and compacting the xref.
+ *
+ * Dictionaries and arrays are walked recursively. Each indirect
+ * reference found as a dictionary value or array element is replaced
+ * by a new reference to renumbermap[old number] with generation 0.
+ *
+ * A reference whose object number lies outside the xref table has no
+ * renumbermap entry; it is left untouched instead of indexing the map
+ * out of bounds.
+ */
+static void renumberobj(fz_obj *obj)
+{
+        int i;
+        if (fz_is_dict(obj))
+        {
+                for (i = 0; i < fz_dict_len(obj); i++)
+                {
+                        fz_obj *key = fz_dict_get_key(obj, i);
+                        fz_obj *val = fz_dict_get_val(obj, i);
+                        if (fz_is_indirect(val))
+                        {
+                                int o = fz_to_num(val);
+                                /* Dangling reference from the input: nothing to map */
+                                if (o < 0 || o >= xref->len)
+                                        continue;
+                                val = fz_new_indirect(renumbermap[o], 0, xref);
+                                fz_dict_put(obj, key, val);
+                                fz_drop_obj(val);
+                        }
+                        else
+                        {
+                                renumberobj(val);
+                        }
+                }
+        }
+        else if (fz_is_array(obj))
+        {
+                for (i = 0; i < fz_array_len(obj); i++)
+                {
+                        fz_obj *val = fz_array_get(obj, i);
+                        if (fz_is_indirect(val))
+                        {
+                                int o = fz_to_num(val);
+                                /* Dangling reference from the input: nothing to map */
+                                if (o < 0 || o >= xref->len)
+                                        continue;
+                                val = fz_new_indirect(renumbermap[o], 0, xref);
+                                fz_array_put(obj, i, val);
+                                fz_drop_obj(val);
+                        }
+                        else
+                        {
+                                renumberobj(val);
+                        }
+                }
+        }
+}
+/*
+ * Apply renumbermap to the whole document and rebuild the xref table.
+ *
+ * First every reference in the trailer and in each xref entry is
+ * rewritten. Then a fresh table is allocated, used objects are moved
+ * to their new slots, unused objects are dropped, xref->len is shrunk
+ * to the highest new object number plus one, and uselist is reset so
+ * that every object in the compacted table counts as used.
+ */
+static void renumberobjs(void)
+{
+        pdf_xref_entry *oldxref;
+        int newlen;
+        int num;
+        /* Apply renumber map to indirect references in all objects in xref */
+        renumberobj(xref->trailer);
+        for (num = 0; num < xref->len; num++)
+        {
+                fz_obj *obj = xref->table[num].obj;
+                if (fz_is_indirect(obj))
+                {
+                        int o = fz_to_num(obj);
+                        /*
+                         * The entry is itself a reference. Only remap it when
+                         * its target lies inside the table; an out-of-range
+                         * number has no renumbermap slot and is left as is.
+                         */
+                        if (o >= 0 && o < xref->len)
+                        {
+                                obj = fz_new_indirect(renumbermap[o], 0, xref);
+                                pdf_update_object(xref, num, 0, obj);
+                                fz_drop_obj(obj);
+                        }
+                }
+                else
+                {
+                        renumberobj(obj);
+                }
+        }
+        /* Create new table for the reordered, compacted xref */
+        oldxref = xref->table;
+        xref->table = fz_calloc(xref->len, sizeof(pdf_xref_entry));
+        xref->table[0] = oldxref[0];
+        /* Move used objects into the new compacted xref */
+        newlen = 0;
+        for (num = 1; num < xref->len; num++)
+        {
+                if (uselist[num])
+                {
+                        if (newlen < renumbermap[num])
+                                newlen = renumbermap[num];
+                        xref->table[renumbermap[num]] = oldxref[num];
+                }
+                else
+                {
+                        if (oldxref[num].obj)
+                                fz_drop_obj(oldxref[num].obj);
+                }
+        }
+        fz_free(oldxref);
+        /* Update the used objects count in compacted xref */
+        xref->len = newlen + 1;
+        /* Update list of used objects to fit with compacted xref */
+        for (num = 1; num < xref->len; num++)
+                uselist[num] = 1;
+}
+/*
+ * Recreate page tree to only retain specified pages.
+ *
+ * Every remaining command line argument (from fz_optind on) is a comma
+ * separated list of page specs: "N", "N-M", "N-" (up to the last page)
+ * or a spec starting with '-' (which starts from the last page).
+ * Ranges given backwards are swapped and then clamped to the pages
+ * that exist. The catalog is replaced by one holding only /Type and
+ * /Pages, and the page tree root receives a new flat /Kids array and
+ * matching /Count.
+ */
+static void retainpages(int argc, char **argv)
+{
+        fz_error error;
+        fz_obj *oldroot, *root, *pages, *kids, *countobj, *parent;
+        /* The page tree must be loaded before page_objs/page_refs are used */
+        error = pdf_load_page_tree(xref);
+        if (error)
+                die(fz_rethrow(error, "cannot load page tree"));
+        /* New catalog with only pages/type, so unretained pages are not referenced */
+        oldroot = fz_dict_gets(xref->trailer, "Root");
+        pages = fz_dict_gets(oldroot, "Pages");
+        root = fz_new_dict(2);
+        fz_dict_puts(root, "Type", fz_dict_gets(oldroot, "Type"));
+        fz_dict_puts(root, "Pages", fz_dict_gets(oldroot, "Pages"));
+        pdf_update_object(xref, fz_to_num(oldroot), fz_to_gen(oldroot), root);
+        fz_drop_obj(root);
+        /* All retained pages become direct children of the page tree root */
+        parent = fz_new_indirect(fz_to_num(pages), fz_to_gen(pages), xref);
+        kids = fz_new_array(1);
+        /* Consume the remaining arguments one page list at a time */
+        for (; argc - fz_optind; fz_optind++)
+        {
+                char *pagelist = argv[fz_optind];
+                char *spec;
+                while ((spec = fz_strsep(&pagelist, ",")) != NULL)
+                {
+                        int first, last, page;
+                        char *dash = strchr(spec, '-');
+                        if (dash == spec)
+                                first = pdf_count_pages(xref);
+                        else
+                                first = atoi(spec);
+                        last = first;
+                        if (dash)
+                        {
+                                if (strlen(dash) > 1)
+                                        last = atoi(dash + 1);
+                                else
+                                        last = pdf_count_pages(xref);
+                        }
+                        if (first > last)
+                        {
+                                int swap = first;
+                                first = last;
+                                last = swap;
+                        }
+                        if (first < 1)
+                                first = 1;
+                        if (last > pdf_count_pages(xref))
+                                last = pdf_count_pages(xref);
+                        for (page = first; page <= last; page++)
+                        {
+                                fz_obj *pageobj = xref->page_objs[page-1];
+                                fz_obj *pageref = xref->page_refs[page-1];
+                                fz_dict_puts(pageobj, "Parent", parent);
+                                /* The kids array stores the reference, not the page dictionary */
+                                fz_array_push(kids, pageref);
+                        }
+                }
+        }
+        fz_drop_obj(parent);
+        /* Install the new page count and kids array on the page tree root */
+        countobj = fz_new_int(fz_array_len(kids));
+        fz_dict_puts(pages, "Count", countobj);
+        fz_drop_obj(countobj);
+        fz_dict_puts(pages, "Kids", kids);
+        fz_drop_obj(kids);
+}
+/*
+ * Make sure we have loaded objects from object streams.
+ *
+ * Every xref entry of type 'o' is loaded once and released again right
+ * away; a load failure aborts the program through die().
+ */
+static void preloadobjstms(void)
+{
+        int num;
+        for (num = 0; num < xref->len; num++)
+        {
+                fz_error error;
+                fz_obj *loaded;
+                if (xref->table[num].type != 'o')
+                        continue;
+                error = pdf_load_object(&loaded, xref, num, 0);
+                if (error)
+                        die(error);
+                fz_drop_obj(loaded);
+        }
+}
+/*
+ * Save streams and objects to the output
+ */
+/*
+ * Classify one character value: returns 0 for newline, carriage
+ * return, tab and every value from 32 to 127 inclusive, and 1 for
+ * everything else.
+ */
+static inline int isbinary(int c)
+{
+        switch (c)
+        {
+        case '\n':
+        case '\r':
+        case '\t':
+                return 0;
+        default:
+                return c < 32 || c > 127;
+        }
+}
+/*
+ * Returns 1 as soon as any of the first buf->len bytes of the buffer
+ * is classified as binary by isbinary(), and 0 if none is (including
+ * when the buffer is empty).
+ */
+static int isbinarystream(fz_buffer *buf)
+{
+        int pos = 0;
+        while (pos < buf->len)
+        {
+                if (isbinary(buf->data[pos]))
+                        return 1;
+                pos++;
+        }
+        return 0;
+}
+/*
+ * Build a new buffer holding the n bytes at p as lowercase hex digits,
+ * with a newline after every 32 input bytes, terminated by ">\n".
+ *
+ * The buffer is requested with room for exactly that output: two
+ * digits per byte, one newline per full group of 32 bytes, and the
+ * two trailing characters. The caller receives the buffer and is the
+ * one that drops it (see copystream/expandstream).
+ */
+static fz_buffer *hexbuf(unsigned char *p, int n)
+{
+        static const char digits[16] = "0123456789abcdef";
+        fz_buffer *buf = fz_new_buffer(n * 2 + (n / 32) + 2);
+        int col;
+        /* col counts the bytes emitted on the current output line, 1-based */
+        for (col = 1; n-- != 0; p++, col++)
+        {
+                unsigned char byte = *p;
+                buf->data[buf->len++] = digits[byte >> 4];
+                buf->data[buf->len++] = digits[byte & 15];
+                if (col == 32)
+                {
+                        buf->data[buf->len++] = '\n';
+                        col = 0;
+                }
+        }
+        buf->data[buf->len++] = '>';
+        buf->data[buf->len++] = '\n';
+        return buf;
+}
+/*
+ * Put ASCIIHexDecode in front of a stream dictionary's filter chain.
+ *
+ *  - /Filter is a name: it becomes the array [ASCIIHexDecode, name];
+ *    a dictionary /DecodeParms becomes [null, dict] to stay aligned.
+ *  - /Filter is an array: ASCIIHexDecode is inserted into it, and null
+ *    is inserted into an array /DecodeParms.
+ *  - otherwise: /Filter is set to the ASCIIHexDecode name alone.
+ *
+ * /DecodeParms is written back whenever the dictionary had one.
+ */
+static void addhexfilter(fz_obj *dict)
+{
+        fz_obj *ahx = fz_new_name("ASCIIHexDecode");
+        fz_obj *nullobj = fz_new_null();
+        fz_obj *filter = fz_dict_gets(dict, "Filter");
+        fz_obj *parms = fz_dict_gets(dict, "DecodeParms");
+        fz_obj *filterarr = NULL;
+        fz_obj *parmsarr = NULL;
+        if (fz_is_name(filter))
+        {
+                filterarr = fz_new_array(2);
+                fz_array_push(filterarr, ahx);
+                fz_array_push(filterarr, filter);
+                filter = filterarr;
+                if (fz_is_dict(parms))
+                {
+                        parmsarr = fz_new_array(2);
+                        fz_array_push(parmsarr, nullobj);
+                        fz_array_push(parmsarr, parms);
+                        parms = parmsarr;
+                }
+        }
+        else if (fz_is_array(filter))
+        {
+                fz_array_insert(filter, ahx);
+                if (fz_is_array(parms))
+                        fz_array_insert(parms, nullobj);
+        }
+        else
+        {
+                filter = ahx;
+        }
+        fz_dict_puts(dict, "Filter", filter);
+        if (parms)
+                fz_dict_puts(dict, "DecodeParms", parms);
+        /* Release our own references; the dictionary and arrays keep theirs */
+        fz_drop_obj(ahx);
+        fz_drop_obj(nullobj);
+        if (filterarr)
+                fz_drop_obj(filterarr);
+        if (parmsarr)
+                fz_drop_obj(parmsarr);
+}
+/*
+ * Write stream object num/gen using the data returned by
+ * pdf_load_raw_stream (presumably the bytes as stored in the input,
+ * filters still applied - confirm against the library).
+ *
+ * With -a, data that isbinarystream() flags is hex encoded first; the
+ * dictionary then gets ASCIIHexDecode added to its filters and a
+ * /Length matching the encoded size. A load failure aborts via die().
+ */
+static void copystream(fz_obj *obj, int num, int gen)
+{
+        fz_buffer *data;
+        fz_error error = pdf_load_raw_stream(&data, xref, num, gen);
+        if (error)
+                die(error);
+        if (doascii && isbinarystream(data))
+        {
+                fz_buffer *hexed = hexbuf(data->data, data->len);
+                fz_obj *newlen;
+                fz_drop_buffer(data);
+                data = hexed;
+                addhexfilter(obj);
+                newlen = fz_new_int(data->len);
+                fz_dict_puts(obj, "Length", newlen);
+                fz_drop_obj(newlen);
+        }
+        /* Object header, dictionary, then the stream payload */
+        fprintf(out, "%d %d obj\n", num, gen);
+        fz_fprint_obj(out, obj, !doexpand);
+        fprintf(out, "stream\n");
+        fwrite(data->data, 1, data->len, out);
+        fprintf(out, "endstream\nendobj\n\n");
+        fz_drop_buffer(data);
+}
+/*
+ * Write stream object num/gen using the data returned by
+ * pdf_load_stream (presumably the decoded stream contents - confirm
+ * against the library).
+ *
+ * /Filter and /DecodeParms are removed from the dictionary. With -a,
+ * data that isbinarystream() flags is hex encoded and ASCIIHexDecode
+ * is added back as a filter. /Length is always rewritten to the size
+ * of the data actually written. A load failure aborts via die().
+ */
+static void expandstream(fz_obj *obj, int num, int gen)
+{
+        fz_buffer *data;
+        fz_obj *newlen;
+        fz_error error = pdf_load_stream(&data, xref, num, gen);
+        if (error)
+                die(error);
+        fz_dict_dels(obj, "Filter");
+        fz_dict_dels(obj, "DecodeParms");
+        if (doascii && isbinarystream(data))
+        {
+                fz_buffer *hexed = hexbuf(data->data, data->len);
+                fz_drop_buffer(data);
+                data = hexed;
+                addhexfilter(obj);
+        }
+        newlen = fz_new_int(data->len);
+        fz_dict_puts(obj, "Length", newlen);
+        fz_drop_obj(newlen);
+        /* Object header, dictionary, then the stream payload */
+        fprintf(out, "%d %d obj\n", num, gen);
+        fz_fprint_obj(out, obj, !doexpand);
+        fprintf(out, "stream\n");
+        fwrite(data->data, 1, data->len, out);
+        fprintf(out, "endstream\nendobj\n\n");
+        fz_drop_buffer(data);
+}
+/*
+ * Load object num/gen and write it to the output.
+ *
+ * Dictionaries whose /Type is ObjStm or XRef are not written; they
+ * are unmarked in uselist instead. Stream objects go through
+ * expandstream (when -d is set and the object is not a JPX image) or
+ * copystream; all other objects are printed directly. A load failure
+ * aborts via die().
+ */
+static void writeobject(int num, int gen)
+{
+        fz_error error;
+        fz_obj *obj;
+        error = pdf_load_object(&obj, xref, num, gen);
+        if (error)
+                die(error);
+        /* skip ObjStm and XRef objects */
+        if (fz_is_dict(obj))
+        {
+                fz_obj *type = fz_dict_gets(obj, "Type");
+                if (fz_is_name(type) &&
+                        (!strcmp(fz_to_name(type), "ObjStm") ||
+                         !strcmp(fz_to_name(type), "XRef")))
+                {
+                        uselist[num] = 0;
+                        fz_drop_obj(obj);
+                        return;
+                }
+        }
+        if (pdf_is_stream(xref, num, gen))
+        {
+                if (doexpand && !pdf_is_jpx_image(obj))
+                        expandstream(obj, num, gen);
+                else
+                        copystream(obj, num, gen);
+        }
+        else
+        {
+                fprintf(out, "%d %d obj\n", num, gen);
+                fz_fprint_obj(out, obj, !doexpand);
+                fprintf(out, "endobj\n\n");
+        }
+        fz_drop_obj(obj);
+}
+/*
+ * Write the cross reference section, the trailer dictionary and the
+ * startxref/EOF footer.
+ *
+ * Each object gets one entry built from ofslist and genlist, flagged
+ * 'n' when uselist marks it as used and 'f' otherwise. The trailer
+ * carries /Size plus /Info, /Root and /ID when the input trailer has
+ * them; no other input trailer keys are copied.
+ */
+static void writexref(void)
+{
+        static char *copykeys[] = { "Info", "Root", "ID" };
+        fz_obj *trailer;
+        fz_obj *obj;
+        int startxref;
+        int num;
+        int k;
+        /* Remember where the xref section starts for the footer */
+        startxref = ftell(out);
+        fprintf(out, "xref\n0 %d\n", xref->len);
+        num = 0;
+        while (num < xref->len)
+        {
+                if (!uselist[num])
+                        fprintf(out, "%010d %05d f \n", ofslist[num], genlist[num]);
+                else
+                        fprintf(out, "%010d %05d n \n", ofslist[num], genlist[num]);
+                num++;
+        }
+        fprintf(out, "\n");
+        trailer = fz_new_dict(5);
+        obj = fz_new_int(xref->len);
+        fz_dict_puts(trailer, "Size", obj);
+        fz_drop_obj(obj);
+        /* These entries are borrowed from the input trailer, so not dropped here */
+        for (k = 0; k < (int)(sizeof copykeys / sizeof copykeys[0]); k++)
+        {
+                obj = fz_dict_gets(xref->trailer, copykeys[k]);
+                if (obj)
+                        fz_dict_puts(trailer, copykeys[k], obj);
+        }
+        fprintf(out, "trailer\n");
+        fz_fprint_obj(out, trailer, !doexpand);
+        fprintf(out, "\n");
+        fz_drop_obj(trailer);
+        fprintf(out, "startxref\n%d\n%%%%EOF\n", startxref);
+}
+/*
+ * Write every object of the document followed by the xref section.
+ *
+ * Pass 1 records each entry's generation number in genlist ('o'
+ * entries, i.e. objects that came from object streams, get 0), skips
+ * objects the garbage collector left unmarked, and writes the rest
+ * while recording their file offsets in ofslist.
+ *
+ * Pass 2 chains all unused entries into the free list: ofslist of one
+ * free entry holds the number of the next, and the generation number
+ * of each free entry is bumped by one.
+ */
+static void writepdf(void)
+{
+        int lastfree;
+        int num;
+        for (num = 0; num < xref->len; num++)
+        {
+                switch (xref->table[num].type)
+                {
+                case 'f':
+                case 'n':
+                        genlist[num] = xref->table[num].gen;
+                        break;
+                case 'o':
+                        genlist[num] = 0;
+                        break;
+                default:
+                        break;
+                }
+                if (dogarbage && !uselist[num])
+                        continue;
+                if (xref->table[num].type == 'n' || xref->table[num].type == 'o')
+                {
+                        uselist[num] = 1;
+                        ofslist[num] = ftell(out);
+                        writeobject(num, genlist[num]);
+                }
+        }
+        /* Construct linked list of free object slots */
+        lastfree = 0;
+        for (num = 0; num < xref->len; num++)
+        {
+                if (!uselist[num])
+                {
+                        genlist[num]++;
+                        /*
+                         * 65535 is the largest generation number a PDF xref
+                         * entry may carry, and the head of the free list
+                         * normally has it already; do not increment past it.
+                         */
+                        if (genlist[num] > 65535)
+                                genlist[num] = 65535;
+                        ofslist[lastfree] = num;
+                        lastfree = num;
+                }
+        }
+        writexref();
+}
+/*
+ * Entry point: parse options, open the input document and the output
+ * file, run the requested clean-up passes and write the result.
+ *
+ * Arguments after the options are: the input file, an optional output
+ * file (recognised by containing ".pdf" or ".PDF"; "out.pdf" is used
+ * otherwise), and optional page lists that select a subset of pages.
+ * Returns 0 on success; failures exit through die() or usage().
+ */
+int main(int argc, char **argv)
+{
+        fz_error error;
+        char *infile;
+        char *outfile = "out.pdf";
+        char *password = "";
+        int opt, num;
+        int subset;
+        while ((opt = fz_getopt(argc, argv, "adgp:")) != -1)
+        {
+                if (opt == 'p')
+                        password = fz_optarg;
+                else if (opt == 'g')
+                        dogarbage++;
+                else if (opt == 'd')
+                        doexpand++;
+                else if (opt == 'a')
+                        doascii++;
+                else
+                        usage();
+        }
+        if (argc - fz_optind < 1)
+                usage();
+        infile = argv[fz_optind++];
+        /* The next argument is the output name only if it looks like a PDF file */
+        if (argc - fz_optind > 0)
+        {
+                char *candidate = argv[fz_optind];
+                if (strstr(candidate, ".pdf") || strstr(candidate, ".PDF"))
+                {
+                        outfile = candidate;
+                        fz_optind++;
+                }
+        }
+        /* Anything still left on the command line is a page list */
+        subset = (argc - fz_optind > 0);
+        error = pdf_open_xref(&xref, infile, password);
+        if (error)
+                die(fz_rethrow(error, "cannot open input file '%s'", infile));
+        out = fopen(outfile, "wb");
+        if (!out)
+                die(fz_throw("cannot open output file '%s'", outfile));
+        /* File header: version line, then a comment line of non-ASCII bytes */
+        fprintf(out, "%%PDF-%d.%d\n", xref->version / 10, xref->version % 10);
+        fprintf(out, "%%\316\274\341\277\246\n\n");
+        /* Per-object bookkeeping, indexed by object number */
+        uselist = fz_calloc(xref->len + 1, sizeof(char));
+        ofslist = fz_calloc(xref->len + 1, sizeof(int));
+        genlist = fz_calloc(xref->len + 1, sizeof(int));
+        renumbermap = fz_calloc(xref->len + 1, sizeof(int));
+        for (num = 0; num < xref->len; num++)
+        {
+                uselist[num] = 0;
+                ofslist[num] = 0;
+                genlist[num] = 0;
+                /* Until a pass says otherwise every object keeps its number */
+                renumbermap[num] = num;
+        }
+        /* Make sure any objects hidden in compressed streams have been loaded */
+        preloadobjstms();
+        /* Only retain the specified subset of the pages */
+        if (subset)
+                retainpages(argc, argv);
+        /* -g: sweep & mark objects from the trailer */
+        if (dogarbage >= 1)
+                sweepobj(xref->trailer);
+        /* -ggg: coalesce and renumber duplicate objects */
+        if (dogarbage >= 3)
+                removeduplicateobjs();
+        /* -gg: compact the xref, then apply the renumbering to all references */
+        if (dogarbage >= 2)
+        {
+                compactxref();
+                renumberobjs();
+        }
+        writepdf();
+        if (fclose(out))
+                die(fz_throw("cannot close output file '%s'", outfile));
+        fz_free(uselist);
+        fz_free(ofslist);
+        fz_free(genlist);
+        fz_free(renumbermap);
+        pdf_free_xref(xref);
+        fz_flush_warnings();
+        return 0;
+}

Subversion Repositories Kolibri OS

Compare Revisions

Regard whitespace Rev 4679 → Rev 4680