Rev 4680 | Details | Compare with Previous | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
4680 | right-hear | 1 | #include "fitz.h" |
2 | #include "mupdf.h" |
||
3 | |||
/*
 * Case-label sets for use inside switch statements, written so that
 * "case NAME:" expands to a run of individual case labels.
 */

/* Basic character ranges. */
#define RANGE_0_9 \
	'0': case '1': case '2': case '3': case '4': \
	case '5': case '6': case '7': case '8': case '9'
#define RANGE_a_f \
	'a': case 'b': case 'c': case 'd': case 'e': case 'f'
#define RANGE_A_F \
	'A': case 'B': case 'C': case 'D': case 'E': case 'F'

/* Characters that may start a number: sign, decimal point or digit. */
#define IS_NUMBER \
	'+': case '-': case '.': case RANGE_0_9

/* Whitespace: NUL, HT, LF, FF, CR and space. */
#define IS_WHITE \
	'\000': case '\011': case '\012': case '\014': case '\015': case '\040'

/* Hexadecimal digits, either case. */
#define IS_HEX \
	RANGE_0_9: case RANGE_A_F: case RANGE_a_f

/* Delimiter characters. */
#define IS_DELIM \
	'(': case ')': case '<': case '>': case '[': case ']': \
	case '{': case '}': case '/': case '%'
||
25 | |||
/*
 * Return 1 if ch is one of the six whitespace characters recognised by
 * this lexer (NUL, HT, LF, FF, CR, space), 0 otherwise.
 */
static inline int iswhite(int ch)
{
	switch (ch)
	{
	case '\000': /* NUL */
	case '\011': /* horizontal tab */
	case '\012': /* line feed */
	case '\014': /* form feed */
	case '\015': /* carriage return */
	case '\040': /* space */
		return 1;
	default:
		return 0;
	}
}
||
36 | |||
/*
 * Convert one hexadecimal digit character (either case) to its value
 * 0..15. Any other input yields 0.
 */
static inline int unhex(int ch)
{
	int value = 0;

	if ('0' <= ch && ch <= '9')
		value = ch - '0';
	else if ('A' <= ch && ch <= 'F')
		value = 10 + (ch - 'A');
	else if ('a' <= ch && ch <= 'f')
		value = 10 + (ch - 'a');

	return value;
}
||
44 | |||
45 | static void |
||
46 | lex_white(fz_stream *f) |
||
47 | { |
||
48 | int c; |
||
49 | do { |
||
50 | c = fz_read_byte(f); |
||
51 | } while ((c <= 32) && (iswhite(c))); |
||
52 | if (c != EOF) |
||
53 | fz_unread_byte(f); |
||
54 | } |
||
55 | |||
56 | static void |
||
57 | lex_comment(fz_stream *f) |
||
58 | { |
||
59 | int c; |
||
60 | do { |
||
61 | c = fz_read_byte(f); |
||
62 | } while ((c != '\012') && (c != '\015') && (c != EOF)); |
||
63 | } |
||
64 | |||
65 | static int |
||
66 | lex_number(fz_stream *f, char *s, int n, int *tok) |
||
67 | { |
||
68 | char *buf = s; |
||
69 | *tok = PDF_TOK_INT; |
||
70 | |||
71 | /* Initially we might have +, -, . or a digit */ |
||
72 | if (n > 1) |
||
73 | { |
||
74 | int c = fz_read_byte(f); |
||
75 | switch (c) |
||
76 | { |
||
77 | case '.': |
||
78 | *tok = PDF_TOK_REAL; |
||
79 | *s++ = c; |
||
80 | n--; |
||
81 | goto loop_after_dot; |
||
82 | case '+': |
||
83 | case '-': |
||
84 | case RANGE_0_9: |
||
85 | *s++ = c; |
||
86 | n--; |
||
87 | goto loop_after_sign; |
||
88 | default: |
||
89 | fz_unread_byte(f); |
||
90 | goto end; |
||
91 | case EOF: |
||
92 | goto end; |
||
93 | } |
||
94 | } |
||
95 | |||
96 | /* We can't accept a sign from here on in, just . or a digit */ |
||
97 | loop_after_sign: |
||
98 | while (n > 1) |
||
99 | { |
||
100 | int c = fz_read_byte(f); |
||
101 | switch (c) |
||
102 | { |
||
103 | case '.': |
||
104 | *tok = PDF_TOK_REAL; |
||
105 | *s++ = c; |
||
106 | n--; |
||
107 | goto loop_after_dot; |
||
108 | case RANGE_0_9: |
||
109 | *s++ = c; |
||
110 | break; |
||
111 | default: |
||
112 | fz_unread_byte(f); |
||
113 | goto end; |
||
114 | case EOF: |
||
115 | goto end; |
||
116 | } |
||
117 | n--; |
||
118 | } |
||
119 | |||
120 | /* In here, we've seen a dot, so can accept just digits */ |
||
121 | loop_after_dot: |
||
122 | while (n > 1) |
||
123 | { |
||
124 | int c = fz_read_byte(f); |
||
125 | switch (c) |
||
126 | { |
||
127 | case RANGE_0_9: |
||
128 | *s++ = c; |
||
129 | break; |
||
130 | default: |
||
131 | fz_unread_byte(f); |
||
132 | goto end; |
||
133 | case EOF: |
||
134 | goto end; |
||
135 | } |
||
136 | n--; |
||
137 | } |
||
138 | |||
139 | end: |
||
140 | *s = '\0'; |
||
141 | return s-buf; |
||
142 | } |
||
143 | |||
144 | static void |
||
145 | lex_name(fz_stream *f, char *s, int n) |
||
146 | { |
||
147 | while (n > 1) |
||
148 | { |
||
149 | int c = fz_read_byte(f); |
||
150 | switch (c) |
||
151 | { |
||
152 | case IS_WHITE: |
||
153 | case IS_DELIM: |
||
154 | fz_unread_byte(f); |
||
155 | goto end; |
||
156 | case EOF: |
||
157 | goto end; |
||
158 | case '#': |
||
159 | { |
||
160 | int d; |
||
161 | c = fz_read_byte(f); |
||
162 | switch (c) |
||
163 | { |
||
164 | case RANGE_0_9: |
||
165 | d = (c - '0') << 4; |
||
166 | break; |
||
167 | case RANGE_a_f: |
||
168 | d = (c - 'a' + 10) << 4; |
||
169 | break; |
||
170 | case RANGE_A_F: |
||
171 | d = (c - 'A' + 10) << 4; |
||
172 | break; |
||
173 | default: |
||
174 | fz_unread_byte(f); |
||
175 | /* fallthrough */ |
||
176 | case EOF: |
||
177 | goto end; |
||
178 | } |
||
179 | c = fz_read_byte(f); |
||
180 | switch (c) |
||
181 | { |
||
182 | case RANGE_0_9: |
||
183 | c -= '0'; |
||
184 | break; |
||
185 | case RANGE_a_f: |
||
186 | c -= 'a' - 10; |
||
187 | break; |
||
188 | case RANGE_A_F: |
||
189 | c -= 'A' - 10; |
||
190 | break; |
||
191 | default: |
||
192 | fz_unread_byte(f); |
||
193 | /* fallthrough */ |
||
194 | case EOF: |
||
195 | *s++ = d; |
||
196 | n--; |
||
197 | goto end; |
||
198 | } |
||
199 | *s++ = d + c; |
||
200 | n--; |
||
201 | break; |
||
202 | } |
||
203 | default: |
||
204 | *s++ = c; |
||
205 | n--; |
||
206 | break; |
||
207 | } |
||
208 | } |
||
209 | end: |
||
210 | *s = '\0'; |
||
211 | } |
||
212 | |||
213 | static int |
||
214 | lex_string(fz_stream *f, char *buf, int n) |
||
215 | { |
||
216 | char *s = buf; |
||
217 | char *e = buf + n; |
||
218 | int bal = 1; |
||
219 | int oct; |
||
220 | int c; |
||
221 | |||
222 | while (s < e) |
||
223 | { |
||
224 | c = fz_read_byte(f); |
||
225 | switch (c) |
||
226 | { |
||
227 | case EOF: |
||
228 | goto end; |
||
229 | case '(': |
||
230 | bal++; |
||
231 | *s++ = c; |
||
232 | break; |
||
233 | case ')': |
||
234 | bal --; |
||
235 | if (bal == 0) |
||
236 | goto end; |
||
237 | *s++ = c; |
||
238 | break; |
||
239 | case '\\': |
||
240 | c = fz_read_byte(f); |
||
241 | switch (c) |
||
242 | { |
||
243 | case EOF: |
||
244 | goto end; |
||
245 | case 'n': |
||
246 | *s++ = '\n'; |
||
247 | break; |
||
248 | case 'r': |
||
249 | *s++ = '\r'; |
||
250 | break; |
||
251 | case 't': |
||
252 | *s++ = '\t'; |
||
253 | break; |
||
254 | case 'b': |
||
255 | *s++ = '\b'; |
||
256 | break; |
||
257 | case 'f': |
||
258 | *s++ = '\f'; |
||
259 | break; |
||
260 | case '(': |
||
261 | *s++ = '('; |
||
262 | break; |
||
263 | case ')': |
||
264 | *s++ = ')'; |
||
265 | break; |
||
266 | case '\\': |
||
267 | *s++ = '\\'; |
||
268 | break; |
||
269 | case RANGE_0_9: |
||
270 | oct = c - '0'; |
||
271 | c = fz_read_byte(f); |
||
272 | if (c >= '0' && c <= '9') |
||
273 | { |
||
274 | oct = oct * 8 + (c - '0'); |
||
275 | c = fz_read_byte(f); |
||
276 | if (c >= '0' && c <= '9') |
||
277 | oct = oct * 8 + (c - '0'); |
||
278 | else if (c != EOF) |
||
279 | fz_unread_byte(f); |
||
280 | } |
||
281 | else if (c != EOF) |
||
282 | fz_unread_byte(f); |
||
283 | *s++ = oct; |
||
284 | break; |
||
285 | case '\n': |
||
286 | break; |
||
287 | case '\r': |
||
288 | c = fz_read_byte(f); |
||
289 | if ((c != '\n') && (c != EOF)) |
||
290 | fz_unread_byte(f); |
||
291 | break; |
||
292 | default: |
||
293 | *s++ = c; |
||
294 | } |
||
295 | break; |
||
296 | default: |
||
297 | *s++ = c; |
||
298 | break; |
||
299 | } |
||
300 | } |
||
301 | end: |
||
302 | return s - buf; |
||
303 | } |
||
304 | |||
305 | static int |
||
306 | lex_hex_string(fz_stream *f, char *buf, int n) |
||
307 | { |
||
308 | char *s = buf; |
||
309 | char *e = buf + n; |
||
310 | int a = 0, x = 0; |
||
311 | int c; |
||
312 | |||
313 | while (s < e) |
||
314 | { |
||
315 | c = fz_read_byte(f); |
||
316 | switch (c) |
||
317 | { |
||
318 | case IS_WHITE: |
||
319 | break; |
||
320 | case IS_HEX: |
||
321 | if (x) |
||
322 | { |
||
323 | *s++ = a * 16 + unhex(c); |
||
324 | x = !x; |
||
325 | } |
||
326 | else |
||
327 | { |
||
328 | a = unhex(c); |
||
329 | x = !x; |
||
330 | } |
||
331 | break; |
||
332 | case '>': |
||
333 | case EOF: |
||
334 | goto end; |
||
335 | default: |
||
336 | fz_warn("ignoring invalid character in hex string: '%c'", c); |
||
337 | } |
||
338 | } |
||
339 | end: |
||
340 | return s - buf; |
||
341 | } |
||
342 | |||
343 | static int |
||
344 | pdf_token_from_keyword(char *key) |
||
345 | { |
||
346 | switch (*key) |
||
347 | { |
||
348 | case 'R': |
||
349 | if (!strcmp(key, "R")) return PDF_TOK_R; |
||
350 | break; |
||
351 | case 't': |
||
352 | if (!strcmp(key, "true")) return PDF_TOK_TRUE; |
||
353 | if (!strcmp(key, "trailer")) return PDF_TOK_TRAILER; |
||
354 | break; |
||
355 | case 'f': |
||
356 | if (!strcmp(key, "false")) return PDF_TOK_FALSE; |
||
357 | break; |
||
358 | case 'n': |
||
359 | if (!strcmp(key, "null")) return PDF_TOK_NULL; |
||
360 | break; |
||
361 | case 'o': |
||
362 | if (!strcmp(key, "obj")) return PDF_TOK_OBJ; |
||
363 | break; |
||
364 | case 'e': |
||
365 | if (!strcmp(key, "endobj")) return PDF_TOK_ENDOBJ; |
||
366 | if (!strcmp(key, "endstream")) return PDF_TOK_ENDSTREAM; |
||
367 | break; |
||
368 | case 's': |
||
369 | if (!strcmp(key, "stream")) return PDF_TOK_STREAM; |
||
370 | if (!strcmp(key, "startxref")) return PDF_TOK_STARTXREF; |
||
371 | break; |
||
372 | case 'x': |
||
373 | if (!strcmp(key, "xref")) return PDF_TOK_XREF; |
||
374 | break; |
||
375 | default: |
||
376 | break; |
||
377 | } |
||
378 | |||
379 | return PDF_TOK_KEYWORD; |
||
380 | } |
||
381 | |||
382 | fz_error |
||
383 | pdf_lex(int *tok, fz_stream *f, char *buf, int n, int *sl) |
||
384 | { |
||
385 | while (1) |
||
386 | { |
||
387 | int c = fz_read_byte(f); |
||
388 | switch (c) |
||
389 | { |
||
390 | case EOF: |
||
391 | *tok = PDF_TOK_EOF; |
||
392 | return fz_okay; |
||
393 | case IS_WHITE: |
||
394 | lex_white(f); |
||
395 | break; |
||
396 | case '%': |
||
397 | lex_comment(f); |
||
398 | break; |
||
399 | case '/': |
||
400 | lex_name(f, buf, n); |
||
401 | *sl = strlen(buf); |
||
402 | *tok = PDF_TOK_NAME; |
||
403 | return fz_okay; |
||
404 | case '(': |
||
405 | *sl = lex_string(f, buf, n); |
||
406 | *tok = PDF_TOK_STRING; |
||
407 | return fz_okay; |
||
408 | case ')': |
||
409 | *tok = PDF_TOK_ERROR; |
||
410 | goto cleanuperror; |
||
411 | case '<': |
||
412 | c = fz_read_byte(f); |
||
413 | if (c == '<') |
||
414 | { |
||
415 | *tok = PDF_TOK_OPEN_DICT; |
||
416 | } |
||
417 | else |
||
418 | { |
||
419 | fz_unread_byte(f); |
||
420 | *sl = lex_hex_string(f, buf, n); |
||
421 | *tok = PDF_TOK_STRING; |
||
422 | } |
||
423 | return fz_okay; |
||
424 | case '>': |
||
425 | c = fz_read_byte(f); |
||
426 | if (c == '>') |
||
427 | { |
||
428 | *tok = PDF_TOK_CLOSE_DICT; |
||
429 | return fz_okay; |
||
430 | } |
||
431 | *tok = PDF_TOK_ERROR; |
||
432 | goto cleanuperror; |
||
433 | case '[': |
||
434 | *tok = PDF_TOK_OPEN_ARRAY; |
||
435 | return fz_okay; |
||
436 | case ']': |
||
437 | *tok = PDF_TOK_CLOSE_ARRAY; |
||
438 | return fz_okay; |
||
439 | case '{': |
||
440 | *tok = PDF_TOK_OPEN_BRACE; |
||
441 | return fz_okay; |
||
442 | case '}': |
||
443 | *tok = PDF_TOK_CLOSE_BRACE; |
||
444 | return fz_okay; |
||
445 | case IS_NUMBER: |
||
446 | fz_unread_byte(f); |
||
447 | *sl = lex_number(f, buf, n, tok); |
||
448 | return fz_okay; |
||
449 | default: /* isregular: !isdelim && !iswhite && c != EOF */ |
||
450 | fz_unread_byte(f); |
||
451 | lex_name(f, buf, n); |
||
452 | *sl = strlen(buf); |
||
453 | *tok = pdf_token_from_keyword(buf); |
||
454 | return fz_okay; |
||
455 | } |
||
456 | } |
||
457 | |||
458 | cleanuperror: |
||
459 | *tok = PDF_TOK_ERROR; |
||
460 | return fz_throw("lexical error"); |
||
461 | }') |