Go to most recent revision | Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
4680 | right-hear | 1 | /* |
2 | * The CMap data structure here is constructed on the fly by |
||
3 | * adding simple range-to-range mappings. Then the data structure |
||
4 | * is optimized to contain both range-to-range and range-to-table |
||
5 | * lookups. |
||
6 | * |
||
7 | * Any one-to-many mappings are inserted as one-to-table |
||
8 | * lookups in the beginning, and are not affected by the optimization |
||
9 | * stage. |
||
10 | * |
||
11 | * There is a special function to add a 256-length range-to-table mapping. |
||
12 | * The ranges do not have to be added in order. |
||
13 | * |
||
14 | * This code can be a lot simpler if we don't care about wasting memory, |
||
15 | * or can trust the parser to give us optimal mappings. |
||
16 | */ |
||
17 | |||
18 | #include "fitz.h" |
||
19 | #include "mupdf.h" |
||
20 | |||
21 | /* Macros for accessing the combined extent_flags field */ |
||
22 | #define pdf_range_high(r) ((r)->low + ((r)->extent_flags >> 2)) |
||
23 | #define pdf_range_flags(r) ((r)->extent_flags & 3) |
||
24 | #define pdf_range_set_high(r, h) \ |
||
25 | ((r)->extent_flags = (((r)->extent_flags & 3) | ((h - (r)->low) << 2))) |
||
26 | #define pdf_range_set_flags(r, f) \ |
||
27 | ((r)->extent_flags = (((r)->extent_flags & ~3) | f)) |
||
28 | |||
29 | /* |
||
30 | * Allocate, destroy and simple parameters. |
||
31 | */ |
||
32 | |||
33 | pdf_cmap * |
||
34 | pdf_new_cmap(void) |
||
35 | { |
||
36 | pdf_cmap *cmap; |
||
37 | |||
38 | cmap = fz_malloc(sizeof(pdf_cmap)); |
||
39 | cmap->refs = 1; |
||
40 | |||
41 | strcpy(cmap->cmap_name, ""); |
||
42 | strcpy(cmap->usecmap_name, ""); |
||
43 | cmap->usecmap = NULL; |
||
44 | cmap->wmode = 0; |
||
45 | cmap->codespace_len = 0; |
||
46 | |||
47 | cmap->rlen = 0; |
||
48 | cmap->rcap = 0; |
||
49 | cmap->ranges = NULL; |
||
50 | |||
51 | cmap->tlen = 0; |
||
52 | cmap->tcap = 0; |
||
53 | cmap->table = NULL; |
||
54 | |||
55 | return cmap; |
||
56 | } |
||
57 | |||
58 | pdf_cmap * |
||
59 | pdf_keep_cmap(pdf_cmap *cmap) |
||
60 | { |
||
61 | if (cmap->refs >= 0) |
||
62 | cmap->refs ++; |
||
63 | return cmap; |
||
64 | } |
||
65 | |||
66 | void |
||
67 | pdf_drop_cmap(pdf_cmap *cmap) |
||
68 | { |
||
69 | if (cmap->refs >= 0) |
||
70 | { |
||
71 | if (--cmap->refs == 0) |
||
72 | { |
||
73 | if (cmap->usecmap) |
||
74 | pdf_drop_cmap(cmap->usecmap); |
||
75 | fz_free(cmap->ranges); |
||
76 | fz_free(cmap->table); |
||
77 | fz_free(cmap); |
||
78 | } |
||
79 | } |
||
80 | } |
||
81 | |||
82 | void |
||
83 | pdf_set_usecmap(pdf_cmap *cmap, pdf_cmap *usecmap) |
||
84 | { |
||
85 | int i; |
||
86 | |||
87 | if (cmap->usecmap) |
||
88 | pdf_drop_cmap(cmap->usecmap); |
||
89 | cmap->usecmap = pdf_keep_cmap(usecmap); |
||
90 | |||
91 | if (cmap->codespace_len == 0) |
||
92 | { |
||
93 | cmap->codespace_len = usecmap->codespace_len; |
||
94 | for (i = 0; i < usecmap->codespace_len; i++) |
||
95 | cmap->codespace[i] = usecmap->codespace[i]; |
||
96 | } |
||
97 | } |
||
98 | |||
99 | int |
||
100 | pdf_get_wmode(pdf_cmap *cmap) |
||
101 | { |
||
102 | return cmap->wmode; |
||
103 | } |
||
104 | |||
105 | void |
||
106 | pdf_set_wmode(pdf_cmap *cmap, int wmode) |
||
107 | { |
||
108 | cmap->wmode = wmode; |
||
109 | } |
||
110 | |||
111 | void |
||
112 | pdf_debug_cmap(pdf_cmap *cmap) |
||
113 | { |
||
114 | int i, k, n; |
||
115 | |||
116 | printf("cmap $%p /%s {\n", (void *) cmap, cmap->cmap_name); |
||
117 | |||
118 | if (cmap->usecmap_name[0]) |
||
119 | printf("\tusecmap /%s\n", cmap->usecmap_name); |
||
120 | if (cmap->usecmap) |
||
121 | printf("\tusecmap $%p\n", (void *) cmap->usecmap); |
||
122 | |||
123 | printf("\twmode %d\n", cmap->wmode); |
||
124 | |||
125 | printf("\tcodespaces {\n"); |
||
126 | for (i = 0; i < cmap->codespace_len; i++) |
||
127 | { |
||
128 | printf("\t\t<%x> <%x>\n", cmap->codespace[i].low, cmap->codespace[i].high); |
||
129 | } |
||
130 | printf("\t}\n"); |
||
131 | |||
132 | printf("\tranges (%d,%d) {\n", cmap->rlen, cmap->tlen); |
||
133 | for (i = 0; i < cmap->rlen; i++) |
||
134 | { |
||
135 | pdf_range *r = &cmap->ranges[i]; |
||
136 | printf("\t\t<%04x> <%04x> ", r->low, pdf_range_high(r)); |
||
137 | if (pdf_range_flags(r) == PDF_CMAP_TABLE) |
||
138 | { |
||
139 | printf("[ "); |
||
140 | for (k = 0; k < pdf_range_high(r) - r->low + 1; k++) |
||
141 | printf("%d ", cmap->table[r->offset + k]); |
||
142 | printf("]\n"); |
||
143 | } |
||
144 | else if (pdf_range_flags(r) == PDF_CMAP_MULTI) |
||
145 | { |
||
146 | printf("< "); |
||
147 | n = cmap->table[r->offset]; |
||
148 | for (k = 0; k < n; k++) |
||
149 | printf("%04x ", cmap->table[r->offset + 1 + k]); |
||
150 | printf(">\n"); |
||
151 | } |
||
152 | else |
||
153 | printf("%d\n", r->offset); |
||
154 | } |
||
155 | printf("\t}\n}\n"); |
||
156 | } |
||
157 | |||
158 | /* |
||
159 | * Add a codespacerange section. |
||
160 | * These ranges are used by pdf_decode_cmap to decode |
||
161 | * multi-byte encoded strings. |
||
162 | */ |
||
163 | void |
||
164 | pdf_add_codespace(pdf_cmap *cmap, int low, int high, int n) |
||
165 | { |
||
166 | if (cmap->codespace_len + 1 == nelem(cmap->codespace)) |
||
167 | { |
||
168 | fz_warn("assert: too many code space ranges"); |
||
169 | return; |
||
170 | } |
||
171 | |||
172 | cmap->codespace[cmap->codespace_len].n = n; |
||
173 | cmap->codespace[cmap->codespace_len].low = low; |
||
174 | cmap->codespace[cmap->codespace_len].high = high; |
||
175 | cmap->codespace_len ++; |
||
176 | } |
||
177 | |||
178 | /* |
||
179 | * Add an integer to the table. |
||
180 | */ |
||
181 | static void |
||
182 | add_table(pdf_cmap *cmap, int value) |
||
183 | { |
||
184 | if (cmap->tlen == USHRT_MAX) |
||
185 | { |
||
186 | fz_warn("cmap table is full; ignoring additional entries"); |
||
187 | return; |
||
188 | } |
||
189 | if (cmap->tlen + 1 > cmap->tcap) |
||
190 | { |
||
191 | cmap->tcap = cmap->tcap > 1 ? (cmap->tcap * 3) / 2 : 256; |
||
192 | cmap->table = fz_realloc(cmap->table, cmap->tcap, sizeof(unsigned short)); |
||
193 | } |
||
194 | cmap->table[cmap->tlen++] = value; |
||
195 | } |
||
196 | |||
197 | /* |
||
198 | * Add a range. |
||
199 | */ |
||
200 | static void |
||
201 | add_range(pdf_cmap *cmap, int low, int high, int flag, int offset) |
||
202 | { |
||
203 | /* If the range is too large to be represented, split it */ |
||
204 | if (high - low > 0x3fff) |
||
205 | { |
||
206 | add_range(cmap, low, low+0x3fff, flag, offset); |
||
207 | add_range(cmap, low+0x3fff, high, flag, offset+0x3fff); |
||
208 | return; |
||
209 | } |
||
210 | if (cmap->rlen + 1 > cmap->rcap) |
||
211 | { |
||
212 | cmap->rcap = cmap->rcap > 1 ? (cmap->rcap * 3) / 2 : 256; |
||
213 | cmap->ranges = fz_realloc(cmap->ranges, cmap->rcap, sizeof(pdf_range)); |
||
214 | } |
||
215 | cmap->ranges[cmap->rlen].low = low; |
||
216 | pdf_range_set_high(&cmap->ranges[cmap->rlen], high); |
||
217 | pdf_range_set_flags(&cmap->ranges[cmap->rlen], flag); |
||
218 | cmap->ranges[cmap->rlen].offset = offset; |
||
219 | cmap->rlen ++; |
||
220 | } |
||
221 | |||
222 | /* |
||
223 | * Add a range-to-table mapping. |
||
224 | */ |
||
225 | void |
||
226 | pdf_map_range_to_table(pdf_cmap *cmap, int low, int *table, int len) |
||
227 | { |
||
228 | int i; |
||
229 | int high = low + len; |
||
230 | int offset = cmap->tlen; |
||
231 | if (cmap->tlen + len >= USHRT_MAX) |
||
232 | fz_warn("cannot map range to table; table is full"); |
||
233 | else |
||
234 | { |
||
235 | for (i = 0; i < len; i++) |
||
236 | add_table(cmap, table[i]); |
||
237 | add_range(cmap, low, high, PDF_CMAP_TABLE, offset); |
||
238 | } |
||
239 | } |
||
240 | |||
241 | /* |
||
242 | * Add a range of contiguous one-to-one mappings (ie 1..5 maps to 21..25) |
||
243 | */ |
||
244 | void |
||
245 | pdf_map_range_to_range(pdf_cmap *cmap, int low, int high, int offset) |
||
246 | { |
||
247 | add_range(cmap, low, high, high - low == 0 ? PDF_CMAP_SINGLE : PDF_CMAP_RANGE, offset); |
||
248 | } |
||
249 | |||
250 | /* |
||
251 | * Add a single one-to-many mapping. |
||
252 | */ |
||
253 | void |
||
254 | pdf_map_one_to_many(pdf_cmap *cmap, int low, int *values, int len) |
||
255 | { |
||
256 | int offset, i; |
||
257 | |||
258 | if (len == 1) |
||
259 | { |
||
260 | add_range(cmap, low, low, PDF_CMAP_SINGLE, values[0]); |
||
261 | return; |
||
262 | } |
||
263 | |||
264 | if (len > 8) |
||
265 | { |
||
266 | fz_warn("one to many mapping is too large (%d); truncating", len); |
||
267 | len = 8; |
||
268 | } |
||
269 | |||
270 | if (len == 2 && |
||
271 | values[0] >= 0xD800 && values[0] <= 0xDBFF && |
||
272 | values[1] >= 0xDC00 && values[1] <= 0xDFFF) |
||
273 | { |
||
274 | fz_warn("ignoring surrogate pair mapping in cmap"); |
||
275 | return; |
||
276 | } |
||
277 | |||
278 | if (cmap->tlen + len + 1 >= USHRT_MAX) |
||
279 | fz_warn("cannot map one to many; table is full"); |
||
280 | else |
||
281 | { |
||
282 | offset = cmap->tlen; |
||
283 | add_table(cmap, len); |
||
284 | for (i = 0; i < len; i++) |
||
285 | add_table(cmap, values[i]); |
||
286 | add_range(cmap, low, low, PDF_CMAP_MULTI, offset); |
||
287 | } |
||
288 | } |
||
289 | |||
290 | /* |
||
291 | * Sort the input ranges. |
||
292 | * Merge contiguous input ranges to range-to-range if the output is contiguous. |
||
293 | * Merge contiguous input ranges to range-to-table if the output is random. |
||
294 | */ |
||
295 | |||
296 | static int cmprange(const void *va, const void *vb) |
||
297 | { |
||
298 | return ((const pdf_range*)va)->low - ((const pdf_range*)vb)->low; |
||
299 | } |
||
300 | |||
301 | void |
||
302 | pdf_sort_cmap(pdf_cmap *cmap) |
||
303 | { |
||
304 | pdf_range *a; /* last written range on output */ |
||
305 | pdf_range *b; /* current range examined on input */ |
||
306 | |||
307 | if (cmap->rlen == 0) |
||
308 | return; |
||
309 | |||
310 | qsort(cmap->ranges, cmap->rlen, sizeof(pdf_range), cmprange); |
||
311 | |||
312 | if (cmap->tlen == USHRT_MAX) |
||
313 | { |
||
314 | fz_warn("cmap table is full; will not combine ranges"); |
||
315 | return; |
||
316 | } |
||
317 | |||
318 | a = cmap->ranges; |
||
319 | b = cmap->ranges + 1; |
||
320 | |||
321 | while (b < cmap->ranges + cmap->rlen) |
||
322 | { |
||
323 | /* ignore one-to-many mappings */ |
||
324 | if (pdf_range_flags(b) == PDF_CMAP_MULTI) |
||
325 | { |
||
326 | *(++a) = *b; |
||
327 | } |
||
328 | |||
329 | /* input contiguous */ |
||
330 | else if (pdf_range_high(a) + 1 == b->low) |
||
331 | { |
||
332 | /* output contiguous */ |
||
333 | if (pdf_range_high(a) - a->low + a->offset + 1 == b->offset) |
||
334 | { |
||
335 | /* SR -> R and SS -> R and RR -> R and RS -> R */ |
||
336 | if ((pdf_range_flags(a) == PDF_CMAP_SINGLE || pdf_range_flags(a) == PDF_CMAP_RANGE) && (pdf_range_high(b) - a->low <= 0x3fff)) |
||
337 | { |
||
338 | pdf_range_set_flags(a, PDF_CMAP_RANGE); |
||
339 | pdf_range_set_high(a, pdf_range_high(b)); |
||
340 | } |
||
341 | |||
342 | /* LS -> L */ |
||
343 | else if (pdf_range_flags(a) == PDF_CMAP_TABLE && pdf_range_flags(b) == PDF_CMAP_SINGLE && (pdf_range_high(b) - a->low <= 0x3fff)) |
||
344 | { |
||
345 | pdf_range_set_high(a, pdf_range_high(b)); |
||
346 | add_table(cmap, b->offset); |
||
347 | } |
||
348 | |||
349 | /* LR -> LR */ |
||
350 | else if (pdf_range_flags(a) == PDF_CMAP_TABLE && pdf_range_flags(b) == PDF_CMAP_RANGE) |
||
351 | { |
||
352 | *(++a) = *b; |
||
353 | } |
||
354 | |||
355 | /* XX -> XX */ |
||
356 | else |
||
357 | { |
||
358 | *(++a) = *b; |
||
359 | } |
||
360 | } |
||
361 | |||
362 | /* output separated */ |
||
363 | else |
||
364 | { |
||
365 | /* SS -> L */ |
||
366 | if (pdf_range_flags(a) == PDF_CMAP_SINGLE && pdf_range_flags(b) == PDF_CMAP_SINGLE) |
||
367 | { |
||
368 | pdf_range_set_flags(a, PDF_CMAP_TABLE); |
||
369 | pdf_range_set_high(a, pdf_range_high(b)); |
||
370 | add_table(cmap, a->offset); |
||
371 | add_table(cmap, b->offset); |
||
372 | a->offset = cmap->tlen - 2; |
||
373 | } |
||
374 | |||
375 | /* LS -> L */ |
||
376 | else if (pdf_range_flags(a) == PDF_CMAP_TABLE && pdf_range_flags(b) == PDF_CMAP_SINGLE && (pdf_range_high(b) - a->low <= 0x3fff)) |
||
377 | { |
||
378 | pdf_range_set_high(a, pdf_range_high(b)); |
||
379 | add_table(cmap, b->offset); |
||
380 | } |
||
381 | |||
382 | /* XX -> XX */ |
||
383 | else |
||
384 | { |
||
385 | *(++a) = *b; |
||
386 | } |
||
387 | } |
||
388 | } |
||
389 | |||
390 | /* input separated: XX -> XX */ |
||
391 | else |
||
392 | { |
||
393 | *(++a) = *b; |
||
394 | } |
||
395 | |||
396 | b ++; |
||
397 | } |
||
398 | |||
399 | cmap->rlen = a - cmap->ranges + 1; |
||
400 | |||
401 | fz_flush_warnings(); |
||
402 | } |
||
403 | |||
404 | /* |
||
405 | * Lookup the mapping of a codepoint. |
||
406 | */ |
||
407 | int |
||
408 | pdf_lookup_cmap(pdf_cmap *cmap, int cpt) |
||
409 | { |
||
410 | int l = 0; |
||
411 | int r = cmap->rlen - 1; |
||
412 | int m; |
||
413 | |||
414 | while (l <= r) |
||
415 | { |
||
416 | m = (l + r) >> 1; |
||
417 | if (cpt < cmap->ranges[m].low) |
||
418 | r = m - 1; |
||
419 | else if (cpt > pdf_range_high(&cmap->ranges[m])) |
||
420 | l = m + 1; |
||
421 | else |
||
422 | { |
||
423 | int i = cpt - cmap->ranges[m].low + cmap->ranges[m].offset; |
||
424 | if (pdf_range_flags(&cmap->ranges[m]) == PDF_CMAP_TABLE) |
||
425 | return cmap->table[i]; |
||
426 | if (pdf_range_flags(&cmap->ranges[m]) == PDF_CMAP_MULTI) |
||
427 | return -1; /* should use lookup_cmap_full */ |
||
428 | return i; |
||
429 | } |
||
430 | } |
||
431 | |||
432 | if (cmap->usecmap) |
||
433 | return pdf_lookup_cmap(cmap->usecmap, cpt); |
||
434 | |||
435 | return -1; |
||
436 | } |
||
437 | |||
438 | int |
||
439 | pdf_lookup_cmap_full(pdf_cmap *cmap, int cpt, int *out) |
||
440 | { |
||
441 | int i, k, n; |
||
442 | int l = 0; |
||
443 | int r = cmap->rlen - 1; |
||
444 | int m; |
||
445 | |||
446 | while (l <= r) |
||
447 | { |
||
448 | m = (l + r) >> 1; |
||
449 | if (cpt < cmap->ranges[m].low) |
||
450 | r = m - 1; |
||
451 | else if (cpt > pdf_range_high(&cmap->ranges[m])) |
||
452 | l = m + 1; |
||
453 | else |
||
454 | { |
||
455 | k = cpt - cmap->ranges[m].low + cmap->ranges[m].offset; |
||
456 | if (pdf_range_flags(&cmap->ranges[m]) == PDF_CMAP_TABLE) |
||
457 | { |
||
458 | out[0] = cmap->table[k]; |
||
459 | return 1; |
||
460 | } |
||
461 | else if (pdf_range_flags(&cmap->ranges[m]) == PDF_CMAP_MULTI) |
||
462 | { |
||
463 | n = cmap->ranges[m].offset; |
||
464 | for (i = 0; i < cmap->table[n]; i++) |
||
465 | out[i] = cmap->table[n + i + 1]; |
||
466 | return cmap->table[n]; |
||
467 | } |
||
468 | else |
||
469 | { |
||
470 | out[0] = k; |
||
471 | return 1; |
||
472 | } |
||
473 | } |
||
474 | } |
||
475 | |||
476 | if (cmap->usecmap) |
||
477 | return pdf_lookup_cmap_full(cmap->usecmap, cpt, out); |
||
478 | |||
479 | return 0; |
||
480 | } |
||
481 | |||
482 | /* |
||
483 | * Use the codespace ranges to extract a codepoint from a |
||
484 | * multi-byte encoded string. |
||
485 | */ |
||
486 | unsigned char * |
||
487 | pdf_decode_cmap(pdf_cmap *cmap, unsigned char *buf, int *cpt) |
||
488 | { |
||
489 | int k, n, c; |
||
490 | |||
491 | c = 0; |
||
492 | for (n = 0; n < 4; n++) |
||
493 | { |
||
494 | c = (c << 8) | buf[n]; |
||
495 | for (k = 0; k < cmap->codespace_len; k++) |
||
496 | { |
||
497 | if (cmap->codespace[k].n == n + 1) |
||
498 | { |
||
499 | if (c >= cmap->codespace[k].low && c <= cmap->codespace[k].high) |
||
500 | { |
||
501 | *cpt = c; |
||
502 | return buf + n + 1; |
||
503 | } |
||
504 | } |
||
505 | } |
||
506 | } |
||
507 | |||
508 | *cpt = 0; |
||
509 | return buf + 1; |
||
510 | }=>>><>>>>=>>=>=>=>=>>>=>=>>>>>%04x>%04x>>%x>%x>>>><> |