0,0 → 1,463 |
#include "fitz.h" |
#include "mupdf.h" |
|
/* Scan file for objects and reconstruct xref table */ |
|
/*
 * One object found while scanning the file, recorded before the
 * xref table is rebuilt from the collected list.
 */
struct entry
{
	int num;	/* object number parsed in front of the 'obj' keyword */
	int gen;	/* generation number parsed in front of the 'obj' keyword */
	int ofs;	/* file offset at which the object number token starts */
	int stm_ofs;	/* file offset of the stream data, 0 when the object has no stream */
	int stm_len;	/* stream length found by scanning, -1 when no correction is needed */
};
|
static fz_error |
pdf_repair_obj(fz_stream *file, char *buf, int cap, int *stmofsp, int *stmlenp, fz_obj **encrypt, fz_obj **id) |
{ |
fz_error error; |
int tok; |
int stm_len; |
int len; |
int n; |
|
*stmofsp = 0; |
*stmlenp = -1; |
|
stm_len = 0; |
|
error = pdf_lex(&tok, file, buf, cap, &len); |
if (error) |
return fz_rethrow(error, "cannot parse object"); |
if (tok == PDF_TOK_OPEN_DICT) |
{ |
fz_obj *dict, *obj; |
|
/* Send NULL xref so we don't try to resolve references */ |
error = pdf_parse_dict(&dict, NULL, file, buf, cap); |
if (error) |
return fz_rethrow(error, "cannot parse object"); |
|
obj = fz_dict_gets(dict, "Type"); |
if (fz_is_name(obj) && !strcmp(fz_to_name(obj), "XRef")) |
{ |
obj = fz_dict_gets(dict, "Encrypt"); |
if (obj) |
{ |
if (*encrypt) |
fz_drop_obj(*encrypt); |
*encrypt = fz_keep_obj(obj); |
} |
|
obj = fz_dict_gets(dict, "ID"); |
if (obj) |
{ |
if (*id) |
fz_drop_obj(*id); |
*id = fz_keep_obj(obj); |
} |
} |
|
obj = fz_dict_gets(dict, "Length"); |
if (fz_is_int(obj)) |
stm_len = fz_to_int(obj); |
|
fz_drop_obj(dict); |
} |
|
while ( tok != PDF_TOK_STREAM && |
tok != PDF_TOK_ENDOBJ && |
tok != PDF_TOK_ERROR && |
tok != PDF_TOK_EOF ) |
{ |
error = pdf_lex(&tok, file, buf, cap, &len); |
if (error) |
return fz_rethrow(error, "cannot scan for endobj or stream token"); |
} |
|
if (tok == PDF_TOK_STREAM) |
{ |
int c = fz_read_byte(file); |
if (c == '\r') { |
c = fz_peek_byte(file); |
if (c == '\n') |
fz_read_byte(file); |
} |
|
*stmofsp = fz_tell(file); |
if (*stmofsp < 0) |
return fz_throw("cannot seek in file"); |
|
if (stm_len > 0) |
{ |
fz_seek(file, *stmofsp + stm_len, 0); |
error = pdf_lex(&tok, file, buf, cap, &len); |
if (error) |
fz_catch(error, "cannot find endstream token, falling back to scanning"); |
if (tok == PDF_TOK_ENDSTREAM) |
goto atobjend; |
fz_seek(file, *stmofsp, 0); |
} |
|
n = fz_read(file, (unsigned char *) buf, 9); |
if (n < 0) |
return fz_rethrow(n, "cannot read from file"); |
|
while (memcmp(buf, "endstream", 9) != 0) |
{ |
c = fz_read_byte(file); |
if (c == EOF) |
break; |
memmove(buf, buf + 1, 8); |
buf[8] = c; |
} |
|
*stmlenp = fz_tell(file) - *stmofsp - 9; |
|
atobjend: |
error = pdf_lex(&tok, file, buf, cap, &len); |
if (error) |
return fz_rethrow(error, "cannot scan for endobj token"); |
if (tok != PDF_TOK_ENDOBJ) |
fz_warn("object missing 'endobj' token"); |
} |
|
return fz_okay; |
} |
|
static fz_error |
pdf_repair_obj_stm(pdf_xref *xref, int num, int gen) |
{ |
fz_error error; |
fz_obj *obj; |
fz_stream *stm; |
int tok; |
int i, n, count; |
char buf[256]; |
|
error = pdf_load_object(&obj, xref, num, gen); |
if (error) |
return fz_rethrow(error, "cannot load object stream object (%d %d R)", num, gen); |
|
count = fz_to_int(fz_dict_gets(obj, "N")); |
|
fz_drop_obj(obj); |
|
error = pdf_open_stream(&stm, xref, num, gen); |
if (error) |
return fz_rethrow(error, "cannot open object stream object (%d %d R)", num, gen); |
|
for (i = 0; i < count; i++) |
{ |
error = pdf_lex(&tok, stm, buf, sizeof buf, &n); |
if (error || tok != PDF_TOK_INT) |
{ |
fz_close(stm); |
return fz_rethrow(error, "corrupt object stream (%d %d R)", num, gen); |
} |
|
n = atoi(buf); |
if (n >= xref->len) |
pdf_resize_xref(xref, n + 1); |
|
xref->table[n].ofs = num; |
xref->table[n].gen = i; |
xref->table[n].stm_ofs = 0; |
xref->table[n].obj = NULL; |
xref->table[n].type = 'o'; |
|
error = pdf_lex(&tok, stm, buf, sizeof buf, &n); |
if (error || tok != PDF_TOK_INT) |
{ |
fz_close(stm); |
return fz_rethrow(error, "corrupt object stream (%d %d R)", num, gen); |
} |
} |
|
fz_close(stm); |
return fz_okay; |
} |
|
fz_error |
pdf_repair_xref(pdf_xref *xref, char *buf, int bufsize) |
{ |
fz_error error; |
fz_obj *dict, *obj; |
fz_obj *length; |
|
fz_obj *encrypt = NULL; |
fz_obj *id = NULL; |
fz_obj *root = NULL; |
fz_obj *info = NULL; |
|
struct entry *list = NULL; |
int listlen; |
int listcap; |
int maxnum = 0; |
|
int num = 0; |
int gen = 0; |
int tmpofs, numofs = 0, genofs = 0; |
int stm_len, stm_ofs = 0; |
int tok; |
int next; |
int i, n, c; |
|
fz_seek(xref->file, 0, 0); |
|
listlen = 0; |
listcap = 1024; |
list = fz_calloc(listcap, sizeof(struct entry)); |
|
/* look for '%PDF' version marker within first kilobyte of file */ |
n = fz_read(xref->file, (unsigned char *)buf, MAX(bufsize, 1024)); |
if (n < 0) |
{ |
error = fz_rethrow(n, "cannot read from file"); |
goto cleanup; |
} |
|
fz_seek(xref->file, 0, 0); |
for (i = 0; i < n - 4; i++) |
{ |
if (memcmp(buf + i, "%PDF", 4) == 0) |
{ |
fz_seek(xref->file, i + 8, 0); /* skip "%PDF-X.Y" */ |
break; |
} |
} |
|
/* skip comment line after version marker since some generators |
* forget to terminate the comment with a newline */ |
c = fz_read_byte(xref->file); |
while (c >= 0 && (c == ' ' || c == '%')) |
c = fz_read_byte(xref->file); |
fz_unread_byte(xref->file); |
|
while (1) |
{ |
tmpofs = fz_tell(xref->file); |
if (tmpofs < 0) |
{ |
error = fz_throw("cannot tell in file"); |
goto cleanup; |
} |
|
error = pdf_lex(&tok, xref->file, buf, bufsize, &n); |
if (error) |
{ |
fz_catch(error, "ignoring the rest of the file"); |
break; |
} |
|
if (tok == PDF_TOK_INT) |
{ |
numofs = genofs; |
num = gen; |
genofs = tmpofs; |
gen = atoi(buf); |
} |
|
else if (tok == PDF_TOK_OBJ) |
{ |
error = pdf_repair_obj(xref->file, buf, bufsize, &stm_ofs, &stm_len, &encrypt, &id); |
if (error) |
{ |
error = fz_rethrow(error, "cannot parse object (%d %d R)", num, gen); |
goto cleanup; |
} |
|
if (listlen + 1 == listcap) |
{ |
listcap = (listcap * 3) / 2; |
list = fz_realloc(list, listcap, sizeof(struct entry)); |
} |
|
list[listlen].num = num; |
list[listlen].gen = gen; |
list[listlen].ofs = numofs; |
list[listlen].stm_ofs = stm_ofs; |
list[listlen].stm_len = stm_len; |
listlen ++; |
|
if (num > maxnum) |
maxnum = num; |
} |
|
/* trailer dictionary */ |
else if (tok == PDF_TOK_OPEN_DICT) |
{ |
error = pdf_parse_dict(&dict, xref, xref->file, buf, bufsize); |
if (error) |
{ |
error = fz_rethrow(error, "cannot parse object"); |
goto cleanup; |
} |
|
obj = fz_dict_gets(dict, "Encrypt"); |
if (obj) |
{ |
if (encrypt) |
fz_drop_obj(encrypt); |
encrypt = fz_keep_obj(obj); |
} |
|
obj = fz_dict_gets(dict, "ID"); |
if (obj) |
{ |
if (id) |
fz_drop_obj(id); |
id = fz_keep_obj(obj); |
} |
|
obj = fz_dict_gets(dict, "Root"); |
if (obj) |
{ |
if (root) |
fz_drop_obj(root); |
root = fz_keep_obj(obj); |
} |
|
obj = fz_dict_gets(dict, "Info"); |
if (obj) |
{ |
if (info) |
fz_drop_obj(info); |
info = fz_keep_obj(obj); |
} |
|
fz_drop_obj(dict); |
} |
|
else if (tok == PDF_TOK_ERROR) |
fz_read_byte(xref->file); |
|
else if (tok == PDF_TOK_EOF) |
break; |
} |
|
/* make xref reasonable */ |
|
pdf_resize_xref(xref, maxnum + 1); |
|
for (i = 0; i < listlen; i++) |
{ |
xref->table[list[i].num].type = 'n'; |
xref->table[list[i].num].ofs = list[i].ofs; |
xref->table[list[i].num].gen = list[i].gen; |
|
xref->table[list[i].num].stm_ofs = list[i].stm_ofs; |
|
/* corrected stream length */ |
if (list[i].stm_len >= 0) |
{ |
error = pdf_load_object(&dict, xref, list[i].num, list[i].gen); |
if (error) |
{ |
error = fz_rethrow(error, "cannot load stream object (%d %d R)", list[i].num, list[i].gen); |
goto cleanup; |
} |
|
length = fz_new_int(list[i].stm_len); |
fz_dict_puts(dict, "Length", length); |
fz_drop_obj(length); |
|
fz_drop_obj(dict); |
} |
|
} |
|
xref->table[0].type = 'f'; |
xref->table[0].ofs = 0; |
xref->table[0].gen = 65535; |
xref->table[0].stm_ofs = 0; |
xref->table[0].obj = NULL; |
|
next = 0; |
for (i = xref->len - 1; i >= 0; i--) |
{ |
if (xref->table[i].type == 'f') |
{ |
xref->table[i].ofs = next; |
if (xref->table[i].gen < 65535) |
xref->table[i].gen ++; |
next = i; |
} |
} |
|
/* create a repaired trailer, Root will be added later */ |
|
xref->trailer = fz_new_dict(5); |
|
obj = fz_new_int(maxnum + 1); |
fz_dict_puts(xref->trailer, "Size", obj); |
fz_drop_obj(obj); |
|
if (root) |
{ |
fz_dict_puts(xref->trailer, "Root", root); |
fz_drop_obj(root); |
} |
if (info) |
{ |
fz_dict_puts(xref->trailer, "Info", info); |
fz_drop_obj(info); |
} |
|
if (encrypt) |
{ |
if (fz_is_indirect(encrypt)) |
{ |
/* create new reference with non-NULL xref pointer */ |
obj = fz_new_indirect(fz_to_num(encrypt), fz_to_gen(encrypt), xref); |
fz_drop_obj(encrypt); |
encrypt = obj; |
} |
fz_dict_puts(xref->trailer, "Encrypt", encrypt); |
fz_drop_obj(encrypt); |
} |
|
if (id) |
{ |
if (fz_is_indirect(id)) |
{ |
/* create new reference with non-NULL xref pointer */ |
obj = fz_new_indirect(fz_to_num(id), fz_to_gen(id), xref); |
fz_drop_obj(id); |
id = obj; |
} |
fz_dict_puts(xref->trailer, "ID", id); |
fz_drop_obj(id); |
} |
|
fz_free(list); |
return fz_okay; |
|
cleanup: |
if (encrypt) fz_drop_obj(encrypt); |
if (id) fz_drop_obj(id); |
if (root) fz_drop_obj(root); |
if (info) fz_drop_obj(info); |
fz_free(list); |
return error; /* already rethrown */ |
} |
|
fz_error |
pdf_repair_obj_stms(pdf_xref *xref) |
{ |
fz_obj *dict; |
int i; |
|
for (i = 0; i < xref->len; i++) |
{ |
if (xref->table[i].stm_ofs) |
{ |
pdf_load_object(&dict, xref, i, 0); |
if (!strcmp(fz_to_name(fz_dict_gets(dict, "Type")), "ObjStm")) |
pdf_repair_obj_stm(xref, i, 0); |
fz_drop_obj(dict); |
} |
} |
|
return fz_okay; |
} |