Subversion Repositories Kolibri OS

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
8335 maxcodehac 1
/*=============================================================================
2
   GNU UnRTF, a command-line program to convert RTF documents to other formats.
3
   Copyright (C) 2000,2001 Zachary Thayer Smith
4
5
   This program is free software; you can redistribute it and/or modify
6
   it under the terms of the GNU General Public License as published by
7
   the Free Software Foundation; either version 2 of the License, or
8
   (at your option) any later version.
9
10
   This program is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU General Public License for more details.
14
15
   You should have received a copy of the GNU General Public License
16
   along with this program; if not, write to the Free Software
17
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18
19
 
20
=============================================================================*/
21
22
 
23
 
24
 * Module name:    parse
25
 * Author name:    Zach Smith
26
 * Create date:    01 Sep 00
27
 * Purpose:        Parsing of the RTF file into a structure of Word objects.
28
 *----------------------------------------------------------------------
29
 * Changes:
30
 * 15 Oct 00, tuorfa@yahoo.com: parse.c created with functions taken from word.c
31
 * 15 Oct 00, tuorfa@yahoo.com: backslash before newline is now \par
32
 * 08 Apr 01, tuorfa@yahoo.com: removed limit on word length
33
 * 03 Aug 01, tuorfa@yahoo.com: added input buffering
34
 * 19 Sep 01, tuorfa@yahoo.com: cleaned up read_word()
35
 * 22 Sep 01, tuorfa@yahoo.com: moved word_dump() to word.c
36
 * 22 Sep 01, tuorfa@yahoo.com: added function-level comment blocks
37
 *--------------------------------------------------------------------*/
38
39
 
40
#include 
41
#include 
42
#include 
43
44
 
45
#include "parse.h"
46
#include "malloc.h"
47
#include "main.h"
48
#include "error.h"
49
#include "word.h"
50
#include "hash.h"
51
52
 
53
 
54
 
55
/* Pushback slots shared by my_unget_char() and my_getchar().
 * A negative value marks an empty slot.  ungot_char is the slot that
 * my_getchar() hands out first; the other two queue up behind it.
 */
static int ungot_char=-1;
static int ungot_char2=-1;
static int ungot_char3=-1;


/*========================================================================
 * Name:	my_unget_char
 * Purpose:	My own unget routine, handling up to 3 ungot characters.
 *		The character pushed last is the one my_getchar() returns
 *		first.
 * Args:	Character.
 * Returns:	None.
 * Note:	Reports "more than 3 ungot chars" through error_handler()
 *		when all three slots are already occupied.
 *=======================================================================*/

static void my_unget_char (int ch)
{
	if (ungot_char>=0 && ungot_char2>=0 && ungot_char3>=0)
		error_handler ("more than 3 ungot chars");

	/* Shift every pending character one slot back before storing the
	 * new one.  The oldest slot has to be shifted too: my_getchar()
	 * pushes three characters in a row (' ', 'r', 'a') when it rewrites
	 * a backslash-newline, and the first of them would otherwise be
	 * dropped.
	 */
	ungot_char3 = ungot_char2;
	ungot_char2 = ungot_char;
	ungot_char = ch;
}
77
78
 
79
 
80
81
 
82
 
83
/* Input buffering state used by my_getchar().
 *   buffer_size     capacity of read_buf in bytes: READ_BUF_LEN, or a
 *                   quarter of that when the first allocation fails.
 *   read_buf        the buffer itself, allocated on the first read.
 *   read_buf_end    number of bytes the most recent fread() stored in
 *                   read_buf; zero is treated as end of input.
 *   read_buf_index  index of the next unread byte in read_buf.
 */
static int buffer_size = 0;
84
static char *read_buf = NULL;
85
static int read_buf_end = 0;
86
static int read_buf_index = 0;
87
88
 
89
 
90
 
91
 
92
 
93
 * Name:	my_getchar
94
 * Purpose:	Gets a character: either an ungot one, or a buffered one.
95
 * Args:	Input file.
96
 * Returns:	Character, or EOF.
97
 *=======================================================================*/
98
99
 
100
{
101
	int ch;
102
103
 
104
105
 
106
		ch = ungot_char;
107
		ungot_char=ungot_char2;
108
		ungot_char2=ungot_char3;
109
		ungot_char3=-1;
110
		last_returned_ch = ch;
111
		return ch;
112
	}
113
	do {
114
		if (read_buf_index >= read_buf_end) {
115
			if (!read_buf) {
116
				buffer_size = READ_BUF_LEN;
117
				read_buf = my_malloc (buffer_size);
118
				if (!read_buf) {
119
					buffer_size /= 4;
120
					read_buf = my_malloc (buffer_size);
121
					if (!read_buf)
122
						error_handler ("cannot allocate read buffer");
123
				}
124
			}
125
			read_buf_end = fread (read_buf, 1, buffer_size, f);
126
			read_buf_index = 0;
127
			if (!read_buf_end)
128
				return EOF;
129
		}
130
		ch = read_buf [read_buf_index++];
131
132
 
133
			lineno++;
134
			/* Convert \(newline) into \par here */
135
			if (last_returned_ch=='\\') {
136
				my_unget_char (' ');
137
				my_unget_char ('r');
138
				my_unget_char ('a');
139
				ch = 'p';
140
				break;
141
			}
142
		}
143
	}
144
	while (ch=='\r' /* || ch=='\n' */ );
145
146
 
147
148
 
149
	return ch;
150
}
151
152
 
153
 
154
static char *input_str = NULL;
155
static unsigned long current_max_length = 1;
156
157
 
158
 
159
 
160
 * Name:	expand_word_buffer
161
 * Purpose:	Expands the buffer used to store an incoming word.
162
 *		This allows us to remove the limit on word length.
163
 * Args:	None.
164
 * Returns:	None.
165
 *=======================================================================*/
166
167
 
168
expand_word_buffer ()
169
{
170
	char *new_ptr;
171
	unsigned long old_length;
172
	if (!input_str)
173
		error_handler ("no input buffer allocated");
174
	old_length = current_max_length;
175
	current_max_length *= 2;
176
	new_ptr = my_malloc (current_max_length);
177
	if (!new_ptr)
178
		error_handler ("out of memory while resizing buffer");
179
180
 
181
	my_free (input_str);
182
	input_str = new_ptr;
183
	return TRUE;
184
}
185
186
 
187
 
188
 
189
 
190
 * Name:	read_word
191
 * Purpose:	The core of the parser, this reads a word.
192
 * Args:	Input file.
193
 * Returns:	Number of characters in the word, or zero.
194
 * Note:	The word buffer is static and local to this file.
195
 *=======================================================================*/
196
197
 
198
read_word (FILE *f)
199
{
200
	int ch, ch2, ix=0;
201
	int have_whitespace=FALSE;
202
	int is_control_word=FALSE;
203
	int has_numeric_param=FALSE; /* if is_control_word==TRUE */
204
	int need_unget=FALSE;
205
206
 
207
208
 
209
210
 
211
	 */
212
	input_str = my_malloc (current_max_length);
213
	if (!input_str)
214
		error_handler("cannot allocate word storage");
215
216
 
217
		ch = my_getchar(f);
218
	}
219
	while (ch=='\n');
220
221
 
222
	{
223
		/* Compress multiple space chars down to one.
224
		 */
225
		while (ch == ' ') {
226
			ch = my_getchar(f);
227
			have_whitespace=TRUE;
228
		}
229
		if (have_whitespace) {
230
			my_unget_char (ch);
231
			input_str[0]=' ';
232
			input_str[1]=0;
233
			return 1;
234
		}
235
	}
236
237
 
238
	{
239
	case EOF:
240
		return 0;
241
242
 
243
		ch2 = my_getchar(f);
244
245
 
246
		 */
247
		switch (ch2)
248
		{
249
		case '\n':
250
			strcpy (input_str, "\\par");
251
			return 4;
252
		case '~':
253
		case '{':
254
		case '}':
255
		case '\\':
256
		case '_':
257
		case '-':
258
			input_str[0] = '\\';
259
			input_str[1] = ch2;
260
			input_str[2] = 0;
261
			return 2;
262
		case '\'':
263
			/* Preserve \'## expressions (hex char exprs) for later.
264
			 */
265
			input_str[0]='\\';
266
			input_str[1]='\'';
267
			ix=2;
268
			if(ix==current_max_length) {
269
				if (!expand_word_buffer ())
270
					error_handler("word too long");
271
			}
272
			ch = my_getchar(f);
273
			input_str[ix++]=ch;
274
			if(ix==current_max_length) {
275
				if (!expand_word_buffer ())
276
					error_handler("word too long");
277
			}
278
			ch = my_getchar(f);
279
			input_str[ix++]=ch;
280
			if(ix==current_max_length) {
281
				if (!expand_word_buffer ())
282
					error_handler("word too long");
283
			}
284
			input_str[ix]=0;
285
			return ix;
286
		}
287
288
 
289
		ix=1;
290
		input_str[0]=ch;
291
		ch=ch2;
292
		break;
293
294
 
295
		/* In RTF, a tab char is the same as \tab.
296
		 */
297
		strcpy (input_str, "\\tab");
298
		return 4;
299
300
 
301
	case '}':
302
	case ';':
303
		input_str[0]=ch;
304
		input_str[1]=0;
305
		return 1;
306
307
 
308
309
 
310
	{
311
		/* Several chars always ends a word, and we need to save them.
312
		 */
313
		if (ch=='\t' || ch=='{' || ch=='}' || ch=='\\') {
314
			need_unget=TRUE;
315
			break;
316
		}
317
318
 
319
		 * A newline is ignored if this is not a command word.
320
		 */
321
		if (ch=='\n') {
322
			if (is_control_word)
323
				break;
324
			ch = my_getchar(f);
325
			continue;
326
		}
327
328
 
329
		 * A semicolon never ends a regular word.
330
		 */
331
		if (ch==';') {
332
			if (is_control_word) {
333
				need_unget=TRUE;
334
				break;
335
			}
336
		}
337
338
 
339
		 * any word, and if it does not follow a command,
340
		 * then it is a word in itself.
341
		 */
342
		if (ch==' ') {
343
			if (!is_control_word)
344
				need_unget=TRUE;
345
			break;
346
		}
347
348
 
349
		 */
350
		if (is_control_word) {
351
			if (!has_numeric_param && (isdigit(ch) || ch=='-'))
352
				has_numeric_param = TRUE;
353
			else
354
			if (has_numeric_param && !isdigit(ch)) {
355
				if (ch!=' ')
356
					need_unget=TRUE;
357
				break;
358
			}
359
		}
360
361
 
362
		if (ix==current_max_length) {
363
			if (!expand_word_buffer ())
364
				error_handler("word too long");
365
		}
366
		ch = my_getchar (f);
367
	}
368
369
 
370
		my_unget_char(ch);
371
372
 
373
	return ix;
374
}
375
376
 
377
 
378
 
379
 * Name:	word_read
380
 * Purpose:	This is the recursive metareader which pieces together the
381
 *		structure of Word objects.
382
 * Args:	Input file.
383
 * Returns:	Tree of Word objects.
384
 *=======================================================================*/
385
386
 
387
word_read (FILE* f) {
388
	Word * prev_word = NULL;
389
	Word * first_word = NULL;
390
	Word * new_word = NULL; /* temp */
391
392
 
393
394
 
395
		if (!read_word(f)) {
396
			return first_word;
397
		}
398
399
 
400
 
401
			/* Process subwords */
402
403
 
404
printf ("processing subword...\n");
405
#endif
406
407
 
408
			new_word = word_new (NULL);
409
			if (!new_word)
410
				error_handler ("cannot allocate word");
411
412
 
413
			new_word->child = word_read (f);
414
			if (!new_word->hash_index && !new_word->child)
415
			{
416
				/* printf ("unable to read children!\n"); */
417
			}
418
419
 
420
#if 0
421
printf ("returning from word_read.\n");
422
#endif
423
			return first_word;
424
		} else {
425
			new_word = word_new (input_str);
426
		}
427
428
 
429
430
 
431
432
 
433
434
 
435
		 */
436
		my_free (input_str);
437
		input_str = NULL;
438
	}
439
	while(1);
440
441
 
442