0,0 → 1,222 |
/* |
* pdfextract -- the ultimate way to extract images and fonts from pdfs |
*/ |
|
#include "fitz.h" |
#include "mupdf.h" |
|
static pdf_xref *xref = NULL; |
static int dorgb = 0; |
|
void die(fz_error error) |
{ |
fz_catch(error, "aborting"); |
if (xref) |
pdf_free_xref(xref); |
exit(1); |
} |
|
static void usage(void) |
{ |
fprintf(stderr, "usage: pdfextract [options] file.pdf [object numbers]\n"); |
fprintf(stderr, "\t-p\tpassword\n"); |
fprintf(stderr, "\t-r\tconvert images to rgb\n"); |
exit(1); |
} |
|
static int isimage(fz_obj *obj) |
{ |
fz_obj *type = fz_dict_gets(obj, "Subtype"); |
return fz_is_name(type) && !strcmp(fz_to_name(type), "Image"); |
} |
|
static int isfontdesc(fz_obj *obj) |
{ |
fz_obj *type = fz_dict_gets(obj, "Type"); |
return fz_is_name(type) && !strcmp(fz_to_name(type), "FontDescriptor"); |
} |
|
static void saveimage(int num) |
{ |
fz_error error; |
fz_pixmap *img; |
fz_obj *ref; |
char name[1024]; |
|
ref = fz_new_indirect(num, 0, xref); |
|
/* TODO: detect DCTD and save as jpeg */ |
|
error = pdf_load_image(&img, xref, ref); |
if (error) |
die(error); |
|
if (dorgb && img->colorspace && img->colorspace != fz_device_rgb) |
{ |
fz_pixmap *temp; |
temp = fz_new_pixmap_with_rect(fz_device_rgb, fz_bound_pixmap(img)); |
fz_convert_pixmap(img, temp); |
fz_drop_pixmap(img); |
img = temp; |
} |
|
if (img->n <= 4) |
{ |
sprintf(name, "img-%04d.png", num); |
printf("extracting image %s\n", name); |
fz_write_png(img, name, 0); |
} |
else |
{ |
sprintf(name, "img-%04d.pam", num); |
printf("extracting image %s\n", name); |
fz_write_pam(img, name, 0); |
} |
|
fz_drop_pixmap(img); |
fz_drop_obj(ref); |
} |
|
static void savefont(fz_obj *dict, int num) |
{ |
fz_error error; |
char name[1024]; |
char *subtype; |
fz_buffer *buf; |
fz_obj *stream = NULL; |
fz_obj *obj; |
char *ext = ""; |
FILE *f; |
char *fontname = "font"; |
int n; |
|
obj = fz_dict_gets(dict, "FontName"); |
if (obj) |
fontname = fz_to_name(obj); |
|
obj = fz_dict_gets(dict, "FontFile"); |
if (obj) |
{ |
stream = obj; |
ext = "pfa"; |
} |
|
obj = fz_dict_gets(dict, "FontFile2"); |
if (obj) |
{ |
stream = obj; |
ext = "ttf"; |
} |
|
obj = fz_dict_gets(dict, "FontFile3"); |
if (obj) |
{ |
stream = obj; |
|
obj = fz_dict_gets(obj, "Subtype"); |
if (obj && !fz_is_name(obj)) |
die(fz_throw("Invalid font descriptor subtype")); |
|
subtype = fz_to_name(obj); |
if (!strcmp(subtype, "Type1C")) |
ext = "cff"; |
else if (!strcmp(subtype, "CIDFontType0C")) |
ext = "cid"; |
else |
die(fz_throw("Unhandled font type '%s'", subtype)); |
} |
|
if (!stream) |
{ |
fz_warn("Unhandled font type"); |
return; |
} |
|
buf = fz_new_buffer(0); |
|
error = pdf_load_stream(&buf, xref, fz_to_num(stream), fz_to_gen(stream)); |
if (error) |
die(error); |
|
sprintf(name, "%s-%04d.%s", fontname, num, ext); |
printf("extracting font %s\n", name); |
|
f = fopen(name, "wb"); |
if (f == NULL) |
die(fz_throw("Error creating font file")); |
|
n = fwrite(buf->data, 1, buf->len, f); |
if (n < buf->len) |
die(fz_throw("Error writing font file")); |
|
if (fclose(f) < 0) |
die(fz_throw("Error closing font file")); |
|
fz_drop_buffer(buf); |
} |
|
static void showobject(int num) |
{ |
fz_error error; |
fz_obj *obj; |
|
if (!xref) |
die(fz_throw("no file specified")); |
|
error = pdf_load_object(&obj, xref, num, 0); |
if (error) |
die(error); |
|
if (isimage(obj)) |
saveimage(num); |
else if (isfontdesc(obj)) |
savefont(obj, num); |
|
fz_drop_obj(obj); |
} |
|
int main(int argc, char **argv) |
{ |
fz_error error; |
char *infile; |
char *password = ""; |
int c, o; |
|
while ((c = fz_getopt(argc, argv, "p:r")) != -1) |
{ |
switch (c) |
{ |
case 'p': password = fz_optarg; break; |
case 'r': dorgb++; break; |
default: usage(); break; |
} |
} |
|
if (fz_optind == argc) |
usage(); |
|
infile = argv[fz_optind++]; |
error = pdf_open_xref(&xref, infile, password); |
if (error) |
die(fz_rethrow(error, "cannot open input file '%s'", infile)); |
|
if (fz_optind == argc) |
{ |
for (o = 0; o < xref->len; o++) |
showobject(o); |
} |
else |
{ |
while (fz_optind < argc) |
{ |
showobject(atoi(argv[fz_optind])); |
fz_optind++; |
} |
} |
|
pdf_free_xref(xref); |
|
fz_flush_warnings(); |
|
return 0; |
} |