Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
4680 | right-hear | 1 | /* |
2 | * pdfextract -- the ultimate way to extract images and fonts from pdfs |
||
3 | */ |
||
4 | |||
5 | #include "fitz.h" |
||
6 | #include "mupdf.h" |
||
7 | |||
8 | static pdf_xref *xref = NULL; |
||
9 | static int dorgb = 0; |
||
10 | |||
11 | void die(fz_error error) |
||
12 | { |
||
13 | fz_catch(error, "aborting"); |
||
14 | if (xref) |
||
15 | pdf_free_xref(xref); |
||
16 | exit(1); |
||
17 | } |
||
18 | |||
/*
 * Print the command-line synopsis to stderr and exit with status 1.
 * Never returns.
 */
static void usage(void)
{
	static const char *const help[] = {
		"usage: pdfextract [options] file.pdf [object numbers]\n",
		"\t-p\tpassword\n",
		"\t-r\tconvert images to rgb\n",
	};
	int i;

	for (i = 0; i < (int)(sizeof help / sizeof help[0]); i++)
		fputs(help[i], stderr);

	exit(1);
}
||
26 | |||
27 | static int isimage(fz_obj *obj) |
||
28 | { |
||
29 | fz_obj *type = fz_dict_gets(obj, "Subtype"); |
||
30 | return fz_is_name(type) && !strcmp(fz_to_name(type), "Image"); |
||
31 | } |
||
32 | |||
33 | static int isfontdesc(fz_obj *obj) |
||
34 | { |
||
35 | fz_obj *type = fz_dict_gets(obj, "Type"); |
||
36 | return fz_is_name(type) && !strcmp(fz_to_name(type), "FontDescriptor"); |
||
37 | } |
||
38 | |||
39 | static void saveimage(int num) |
||
40 | { |
||
41 | fz_error error; |
||
42 | fz_pixmap *img; |
||
43 | fz_obj *ref; |
||
44 | char name[1024]; |
||
45 | |||
46 | ref = fz_new_indirect(num, 0, xref); |
||
47 | |||
48 | /* TODO: detect DCTD and save as jpeg */ |
||
49 | |||
50 | error = pdf_load_image(&img, xref, ref); |
||
51 | if (error) |
||
52 | die(error); |
||
53 | |||
54 | if (dorgb && img->colorspace && img->colorspace != fz_device_rgb) |
||
55 | { |
||
56 | fz_pixmap *temp; |
||
57 | temp = fz_new_pixmap_with_rect(fz_device_rgb, fz_bound_pixmap(img)); |
||
58 | fz_convert_pixmap(img, temp); |
||
59 | fz_drop_pixmap(img); |
||
60 | img = temp; |
||
61 | } |
||
62 | |||
63 | if (img->n <= 4) |
||
64 | { |
||
65 | sprintf(name, "img-%04d.png", num); |
||
66 | printf("extracting image %s\n", name); |
||
67 | fz_write_png(img, name, 0); |
||
68 | } |
||
69 | else |
||
70 | { |
||
71 | sprintf(name, "img-%04d.pam", num); |
||
72 | printf("extracting image %s\n", name); |
||
73 | fz_write_pam(img, name, 0); |
||
74 | } |
||
75 | |||
76 | fz_drop_pixmap(img); |
||
77 | fz_drop_obj(ref); |
||
78 | } |
||
79 | |||
80 | static void savefont(fz_obj *dict, int num) |
||
81 | { |
||
82 | fz_error error; |
||
83 | char name[1024]; |
||
84 | char *subtype; |
||
85 | fz_buffer *buf; |
||
86 | fz_obj *stream = NULL; |
||
87 | fz_obj *obj; |
||
88 | char *ext = ""; |
||
89 | FILE *f; |
||
90 | char *fontname = "font"; |
||
91 | int n; |
||
92 | |||
93 | obj = fz_dict_gets(dict, "FontName"); |
||
94 | if (obj) |
||
95 | fontname = fz_to_name(obj); |
||
96 | |||
97 | obj = fz_dict_gets(dict, "FontFile"); |
||
98 | if (obj) |
||
99 | { |
||
100 | stream = obj; |
||
101 | ext = "pfa"; |
||
102 | } |
||
103 | |||
104 | obj = fz_dict_gets(dict, "FontFile2"); |
||
105 | if (obj) |
||
106 | { |
||
107 | stream = obj; |
||
108 | ext = "ttf"; |
||
109 | } |
||
110 | |||
111 | obj = fz_dict_gets(dict, "FontFile3"); |
||
112 | if (obj) |
||
113 | { |
||
114 | stream = obj; |
||
115 | |||
116 | obj = fz_dict_gets(obj, "Subtype"); |
||
117 | if (obj && !fz_is_name(obj)) |
||
118 | die(fz_throw("Invalid font descriptor subtype")); |
||
119 | |||
120 | subtype = fz_to_name(obj); |
||
121 | if (!strcmp(subtype, "Type1C")) |
||
122 | ext = "cff"; |
||
123 | else if (!strcmp(subtype, "CIDFontType0C")) |
||
124 | ext = "cid"; |
||
125 | else |
||
126 | die(fz_throw("Unhandled font type '%s'", subtype)); |
||
127 | } |
||
128 | |||
129 | if (!stream) |
||
130 | { |
||
131 | fz_warn("Unhandled font type"); |
||
132 | return; |
||
133 | } |
||
134 | |||
135 | buf = fz_new_buffer(0); |
||
136 | |||
137 | error = pdf_load_stream(&buf, xref, fz_to_num(stream), fz_to_gen(stream)); |
||
138 | if (error) |
||
139 | die(error); |
||
140 | |||
141 | sprintf(name, "%s-%04d.%s", fontname, num, ext); |
||
142 | printf("extracting font %s\n", name); |
||
143 | |||
144 | f = fopen(name, "wb"); |
||
145 | if (f == NULL) |
||
146 | die(fz_throw("Error creating font file")); |
||
147 | |||
148 | n = fwrite(buf->data, 1, buf->len, f); |
||
149 | if (n < buf->len) |
||
150 | die(fz_throw("Error writing font file")); |
||
151 | |||
152 | if (fclose(f) < 0) |
||
153 | die(fz_throw("Error closing font file")); |
||
154 | |||
155 | fz_drop_buffer(buf); |
||
156 | } |
||
157 | |||
158 | static void showobject(int num) |
||
159 | { |
||
160 | fz_error error; |
||
161 | fz_obj *obj; |
||
162 | |||
163 | if (!xref) |
||
164 | die(fz_throw("no file specified")); |
||
165 | |||
166 | error = pdf_load_object(&obj, xref, num, 0); |
||
167 | if (error) |
||
168 | die(error); |
||
169 | |||
170 | if (isimage(obj)) |
||
171 | saveimage(num); |
||
172 | else if (isfontdesc(obj)) |
||
173 | savefont(obj, num); |
||
174 | |||
175 | fz_drop_obj(obj); |
||
176 | } |
||
177 | |||
178 | int main(int argc, char **argv) |
||
179 | { |
||
180 | fz_error error; |
||
181 | char *infile; |
||
182 | char *password = ""; |
||
183 | int c, o; |
||
184 | |||
185 | while ((c = fz_getopt(argc, argv, "p:r")) != -1) |
||
186 | { |
||
187 | switch (c) |
||
188 | { |
||
189 | case 'p': password = fz_optarg; break; |
||
190 | case 'r': dorgb++; break; |
||
191 | default: usage(); break; |
||
192 | } |
||
193 | } |
||
194 | |||
195 | if (fz_optind == argc) |
||
196 | usage(); |
||
197 | |||
198 | infile = argv[fz_optind++]; |
||
199 | error = pdf_open_xref(&xref, infile, password); |
||
200 | if (error) |
||
201 | die(fz_rethrow(error, "cannot open input file '%s'", infile)); |
||
202 | |||
203 | if (fz_optind == argc) |
||
204 | { |
||
205 | for (o = 0; o < xref->len; o++) |
||
206 | showobject(o); |
||
207 | } |
||
208 | else |
||
209 | { |
||
210 | while (fz_optind < argc) |
||
211 | { |
||
212 | showobject(atoi(argv[fz_optind])); |
||
213 | fz_optind++; |
||
214 | } |
||
215 | } |
||
216 | |||
217 | pdf_free_xref(xref); |
||
218 | |||
219 | fz_flush_warnings(); |
||
220 | |||
221 | return 0; |
||
222 | }>>>>=> |