Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
8335 | maxcodehac | 1 | |
2 | GNU UnRTF, a command-line program to convert RTF documents to other formats. |
||
3 | Copyright (C) 2000,2001 Zachary Thayer Smith |
||
4 | |||
5 | This program is free software; you can redistribute it and/or modify |
||
6 | it under the terms of the GNU General Public License as published by |
||
7 | the Free Software Foundation; either version 2 of the License, or |
||
8 | (at your option) any later version. |
||
9 | |||
10 | This program is distributed in the hope that it will be useful, |
||
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
13 | GNU General Public License for more details. |
||
14 | |||
15 | You should have received a copy of the GNU General Public License |
||
16 | along with this program; if not, write to the Free Software |
||
17 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
||
18 | |||
19 | |||
20 | =============================================================================*/ |
||
21 | |||
22 | |||
23 | |||
24 | * Module name: parse |
||
25 | * Author name: Zach Smith |
||
26 | * Create date: 01 Sep 00 |
||
27 | * Purpose: Parsing of the RTF file into a structure of Word objects. |
||
28 | *---------------------------------------------------------------------- |
||
29 | * Changes: |
||
30 | * 15 Oct 00, tuorfa@yahoo.com: parse.c created with functions taken from word.c |
||
31 | * 15 Oct 00, tuorfa@yahoo.com: backslash before newline is now \par |
||
32 | * 08 Apr 01, tuorfa@yahoo.com: removed limit on word length |
||
33 | * 03 Aug 01, tuorfa@yahoo.com: added input buffering |
||
34 | * 19 Sep 01, tuorfa@yahoo.com: cleaned up read_word() |
||
35 | * 22 Sep 01, tuorfa@yahoo.com: moved word_dump() to word.c |
||
36 | * 22 Sep 01, tuorfa@yahoo.com: added function-level comment blocks |
||
37 | *--------------------------------------------------------------------*/ |
||
38 | |||
39 | |||
40 | #include |
||
41 | #include |
||
42 | #include |
||
43 | |||
44 | |||
45 | #include "parse.h" |
||
46 | #include "malloc.h" |
||
47 | #include "main.h" |
||
48 | #include "error.h" |
||
49 | #include "word.h" |
||
50 | #include "hash.h" |
||
51 | |||
52 | |||
53 | |||
54 | |||
55 | static int ungot_char=-1; |
||
56 | static int ungot_char2=-1; |
||
57 | static int ungot_char3=-1; |
||
58 | |||
59 | |||
60 | |||
61 | |||
62 | * Name: my_unget_char |
||
63 | * Purpose: My own unget routine, handling up to 3 ungot characters. |
||
64 | * Args: Character. |
||
65 | * Returns: None. |
||
66 | *=======================================================================*/ |
||
67 | |||
68 | |||
69 | { |
||
70 | if (ungot_char>=0 && ungot_char2>=0 && ungot_char3>=0) |
||
71 | error_handler ("more than 3 ungot chars"); |
||
72 | |||
73 | |||
74 | ungot_char2 = ungot_char; |
||
75 | ungot_char = ch; |
||
76 | } |
||
77 | |||
78 | |||
79 | |||
80 | |||
81 | |||
82 | |||
83 | static int buffer_size = 0; |
||
84 | static char *read_buf = NULL; |
||
85 | static int read_buf_end = 0; |
||
86 | static int read_buf_index = 0; |
||
87 | |||
88 | |||
89 | |||
90 | |||
91 | |||
92 | |||
93 | * Name: my_getchar |
||
94 | * Purpose: Gets a character: either an ungot one, or a buffered one. |
||
95 | * Args: Input file. |
||
96 | * Returns: Character, or EOF. |
||
97 | *=======================================================================*/ |
||
98 | |||
99 | |||
100 | { |
||
101 | int ch; |
||
102 | |||
103 | |||
104 | |||
105 | |||
106 | ch = ungot_char; |
||
107 | ungot_char=ungot_char2; |
||
108 | ungot_char2=ungot_char3; |
||
109 | ungot_char3=-1; |
||
110 | last_returned_ch = ch; |
||
111 | return ch; |
||
112 | } |
||
113 | do { |
||
114 | if (read_buf_index >= read_buf_end) { |
||
115 | if (!read_buf) { |
||
116 | buffer_size = READ_BUF_LEN; |
||
117 | read_buf = my_malloc (buffer_size); |
||
118 | if (!read_buf) { |
||
119 | buffer_size /= 4; |
||
120 | read_buf = my_malloc (buffer_size); |
||
121 | if (!read_buf) |
||
122 | error_handler ("cannot allocate read buffer"); |
||
123 | } |
||
124 | } |
||
125 | read_buf_end = fread (read_buf, 1, buffer_size, f); |
||
126 | read_buf_index = 0; |
||
127 | if (!read_buf_end) |
||
128 | return EOF; |
||
129 | } |
||
130 | ch = read_buf [read_buf_index++]; |
||
131 | |||
132 | |||
133 | lineno++; |
||
134 | /* Convert \(newline) into \par here */ |
||
135 | if (last_returned_ch=='\\') { |
||
136 | my_unget_char (' '); |
||
137 | my_unget_char ('r'); |
||
138 | my_unget_char ('a'); |
||
139 | ch = 'p'; |
||
140 | break; |
||
141 | } |
||
142 | } |
||
143 | } |
||
144 | while (ch=='\r' /* || ch=='\n' */ ); |
||
145 | |||
146 | |||
147 | |||
148 | |||
149 | return ch; |
||
150 | } |
||
151 | |||
152 | |||
153 | |||
154 | static char *input_str = NULL; |
||
155 | static unsigned long current_max_length = 1; |
||
156 | |||
157 | |||
158 | |||
159 | |||
160 | * Name: expand_word_buffer |
||
161 | * Purpose: Expands the buffer used to store an incoming word. |
||
162 | * This allows us to remove the limit on word length. |
||
163 | * Args: None. |
||
164 | * Returns: None. |
||
165 | *=======================================================================*/ |
||
166 | |||
167 | |||
168 | expand_word_buffer () |
||
169 | { |
||
170 | char *new_ptr; |
||
171 | unsigned long old_length; |
||
172 | if (!input_str) |
||
173 | error_handler ("no input buffer allocated"); |
||
174 | old_length = current_max_length; |
||
175 | current_max_length *= 2; |
||
176 | new_ptr = my_malloc (current_max_length); |
||
177 | if (!new_ptr) |
||
178 | error_handler ("out of memory while resizing buffer"); |
||
179 | |||
180 | |||
181 | my_free (input_str); |
||
182 | input_str = new_ptr; |
||
183 | return TRUE; |
||
184 | } |
||
185 | |||
186 | |||
187 | |||
188 | |||
189 | |||
190 | * Name: read_word |
||
191 | * Purpose: The core of the parser, this reads a word. |
||
192 | * Args: Input file. |
||
193 | * Returns: Number of characters in the word, or zero. |
||
194 | * Note: The word buffer is static and local to this file. |
||
195 | *=======================================================================*/ |
||
196 | |||
197 | |||
198 | read_word (FILE *f) |
||
199 | { |
||
200 | int ch, ch2, ix=0; |
||
201 | int have_whitespace=FALSE; |
||
202 | int is_control_word=FALSE; |
||
203 | int has_numeric_param=FALSE; /* if is_control_word==TRUE */ |
||
204 | int need_unget=FALSE; |
||
205 | |||
206 | |||
207 | |||
208 | |||
209 | |||
210 | |||
211 | */ |
||
212 | input_str = my_malloc (current_max_length); |
||
213 | if (!input_str) |
||
214 | error_handler("cannot allocate word storage"); |
||
215 | |||
216 | |||
217 | ch = my_getchar(f); |
||
218 | } |
||
219 | while (ch=='\n'); |
||
220 | |||
221 | |||
222 | { |
||
223 | /* Compress multiple space chars down to one. |
||
224 | */ |
||
225 | while (ch == ' ') { |
||
226 | ch = my_getchar(f); |
||
227 | have_whitespace=TRUE; |
||
228 | } |
||
229 | if (have_whitespace) { |
||
230 | my_unget_char (ch); |
||
231 | input_str[0]=' '; |
||
232 | input_str[1]=0; |
||
233 | return 1; |
||
234 | } |
||
235 | } |
||
236 | |||
237 | |||
238 | { |
||
239 | case EOF: |
||
240 | return 0; |
||
241 | |||
242 | |||
243 | ch2 = my_getchar(f); |
||
244 | |||
245 | |||
246 | */ |
||
247 | switch (ch2) |
||
248 | { |
||
249 | case '\n': |
||
250 | strcpy (input_str, "\\par"); |
||
251 | return 4; |
||
252 | case '~': |
||
253 | case '{': |
||
254 | case '}': |
||
255 | case '\\': |
||
256 | case '_': |
||
257 | case '-': |
||
258 | input_str[0] = '\\'; |
||
259 | input_str[1] = ch2; |
||
260 | input_str[2] = 0; |
||
261 | return 2; |
||
262 | case '\'': |
||
263 | /* Preserve \'## expressions (hex char exprs) for later. |
||
264 | */ |
||
265 | input_str[0]='\\'; |
||
266 | input_str[1]='\''; |
||
267 | ix=2; |
||
268 | if(ix==current_max_length) { |
||
269 | if (!expand_word_buffer ()) |
||
270 | error_handler("word too long"); |
||
271 | } |
||
272 | ch = my_getchar(f); |
||
273 | input_str[ix++]=ch; |
||
274 | if(ix==current_max_length) { |
||
275 | if (!expand_word_buffer ()) |
||
276 | error_handler("word too long"); |
||
277 | } |
||
278 | ch = my_getchar(f); |
||
279 | input_str[ix++]=ch; |
||
280 | if(ix==current_max_length) { |
||
281 | if (!expand_word_buffer ()) |
||
282 | error_handler("word too long"); |
||
283 | } |
||
284 | input_str[ix]=0; |
||
285 | return ix; |
||
286 | } |
||
287 | |||
288 | |||
289 | ix=1; |
||
290 | input_str[0]=ch; |
||
291 | ch=ch2; |
||
292 | break; |
||
293 | |||
294 | |||
295 | /* In RTF, a tab char is the same as \tab. |
||
296 | */ |
||
297 | strcpy (input_str, "\\tab"); |
||
298 | return 4; |
||
299 | |||
300 | |||
301 | case '}': |
||
302 | case ';': |
||
303 | input_str[0]=ch; |
||
304 | input_str[1]=0; |
||
305 | return 1; |
||
306 | |||
307 | |||
308 | |||
309 | |||
310 | { |
||
311 | /* Several chars always ends a word, and we need to save them. |
||
312 | */ |
||
313 | if (ch=='\t' || ch=='{' || ch=='}' || ch=='\\') { |
||
314 | need_unget=TRUE; |
||
315 | break; |
||
316 | } |
||
317 | |||
318 | |||
319 | * A newline is ignored if this is not a command word. |
||
320 | */ |
||
321 | if (ch=='\n') { |
||
322 | if (is_control_word) |
||
323 | break; |
||
324 | ch = my_getchar(f); |
||
325 | continue; |
||
326 | } |
||
327 | |||
328 | |||
329 | * A semicolon never ends a regular word. |
||
330 | */ |
||
331 | if (ch==';') { |
||
332 | if (is_control_word) { |
||
333 | need_unget=TRUE; |
||
334 | break; |
||
335 | } |
||
336 | } |
||
337 | |||
338 | |||
339 | * any word, and if it does not follow a command, |
||
340 | * then it is a word in itself. |
||
341 | */ |
||
342 | if (ch==' ') { |
||
343 | if (!is_control_word) |
||
344 | need_unget=TRUE; |
||
345 | break; |
||
346 | } |
||
347 | |||
348 | |||
349 | */ |
||
350 | if (is_control_word) { |
||
351 | if (!has_numeric_param && (isdigit(ch) || ch=='-')) |
||
352 | has_numeric_param = TRUE; |
||
353 | else |
||
354 | if (has_numeric_param && !isdigit(ch)) { |
||
355 | if (ch!=' ') |
||
356 | need_unget=TRUE; |
||
357 | break; |
||
358 | } |
||
359 | } |
||
360 | |||
361 | |||
362 | if (ix==current_max_length) { |
||
363 | if (!expand_word_buffer ()) |
||
364 | error_handler("word too long"); |
||
365 | } |
||
366 | ch = my_getchar (f); |
||
367 | } |
||
368 | |||
369 | |||
370 | my_unget_char(ch); |
||
371 | |||
372 | |||
373 | return ix; |
||
374 | } |
||
375 | |||
376 | |||
377 | |||
378 | |||
379 | * Name: word_read |
||
380 | * Purpose: This is the recursive metareader which pieces together the |
||
381 | * structure of Word objects. |
||
382 | * Args: Input file. |
||
383 | * Returns: Tree of Word objects. |
||
384 | *=======================================================================*/ |
||
385 | |||
386 | |||
387 | word_read (FILE* f) { |
||
388 | Word * prev_word = NULL; |
||
389 | Word * first_word = NULL; |
||
390 | Word * new_word = NULL; /* temp */ |
||
391 | |||
392 | |||
393 | |||
394 | |||
395 | if (!read_word(f)) { |
||
396 | return first_word; |
||
397 | } |
||
398 | |||
399 | |||
400 | |||
401 | /* Process subwords */ |
||
402 | |||
403 | |||
404 | printf ("processing subword...\n"); |
||
405 | #endif |
||
406 | |||
407 | |||
408 | new_word = word_new (NULL); |
||
409 | if (!new_word) |
||
410 | error_handler ("cannot allocate word"); |
||
411 | |||
412 | |||
413 | new_word->child = word_read (f); |
||
414 | if (!new_word->hash_index && !new_word->child) |
||
415 | { |
||
416 | /* printf ("unable to read children!\n"); */ |
||
417 | } |
||
418 | |||
419 | |||
420 | #if 0 |
||
421 | printf ("returning from word_read.\n"); |
||
422 | #endif |
||
423 | return first_word; |
||
424 | } else { |
||
425 | new_word = word_new (input_str); |
||
426 | } |
||
427 | |||
428 | |||
429 | |||
430 | |||
431 | |||
432 | |||
433 | |||
434 | |||
435 | */ |
||
436 | my_free (input_str); |
||
437 | input_str = NULL; |
||
438 | } |
||
439 | while(1); |
||
440 | |||
441 | |||
442 |