WebSVN – Kolibri OS – Blame – /programs/media/unrtf/parse.c

Rev	Author	Line No.	Line
8335	maxcodehac	1	/*=============================================================================
		2	GNU UnRTF, a command-line program to convert RTF documents to other formats.
		3	Copyright (C) 2000,2001 Zachary Thayer Smith
		4
		5	This program is free software; you can redistribute it and/or modify
		6	it under the terms of the GNU General Public License as published by
		7	the Free Software Foundation; either version 2 of the License, or
		8	(at your option) any later version.
		9
		10	This program is distributed in the hope that it will be useful,
		11	but WITHOUT ANY WARRANTY; without even the implied warranty of
		12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
		13	GNU General Public License for more details.
		14
		15	You should have received a copy of the GNU General Public License
		16	along with this program; if not, write to the Free Software
		17	Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
		18
		19
		20	=============================================================================*/
		21
		22
		23	/*----------------------------------------------------------------------
		24	* Module name: parse
		25	* Author name: Zach Smith
		26	* Create date: 01 Sep 00
		27	* Purpose: Parsing of the RTF file into a structure of Word objects.
		28	*----------------------------------------------------------------------
		29	* Changes:
		30	* 15 Oct 00, tuorfa@yahoo.com: parse.c created with functions taken from word.c
		31	* 15 Oct 00, tuorfa@yahoo.com: backslash before newline is now \par
		32	* 08 Apr 01, tuorfa@yahoo.com: removed limit on word length
		33	* 03 Aug 01, tuorfa@yahoo.com: added input buffering
		34	* 19 Sep 01, tuorfa@yahoo.com: cleaned up read_word()
		35	* 22 Sep 01, tuorfa@yahoo.com: moved word_dump() to word.c
		36	* 22 Sep 01, tuorfa@yahoo.com: added function-level comment blocks
		37	*--------------------------------------------------------------------*/
		38
		39
		40	#include <stdio.h>
		41	#include <ctype.h>
		42	#include <string.h>
		43
		44
		45	#include "parse.h"
		46	#include "malloc.h"
		47	#include "main.h"
		48	#include "error.h"
		49	#include "word.h"
		50	#include "hash.h"
		51
		52
		53
		54
		55	static int ungot_char=-1;
		56	static int ungot_char2=-1;
		57	static int ungot_char3=-1;
		58
		59
		60
		61	/*========================================================================
		62	* Name: my_unget_char
		63	* Purpose: My own unget routine, handling up to 3 ungot characters.
		64	* Args: Character.
		65	* Returns: None.
		66	*=======================================================================*/
		67
		68
		69	{
		70	if (ungot_char>=0 && ungot_char2>=0 && ungot_char3>=0)
		71	error_handler ("more than 3 ungot chars");
		72
		73
		74	ungot_char2 = ungot_char;
		75	ungot_char = ch;
		76	}
		77
		78
		79
		80
		81
		82
		83	static int buffer_size = 0;
		84	static char *read_buf = NULL;
		85	static int read_buf_end = 0;
		86	static int read_buf_index = 0;
		87
		88
		89
		90
		91
		92	/*========================================================================
		93	* Name: my_getchar
		94	* Purpose: Gets a character: either an ungot one, or a buffered one.
		95	* Args: Input file.
		96	* Returns: Character, or EOF.
		97	*=======================================================================*/
		98
		99
		100	{
		101	int ch;
		102
		103
		104
		105
		106	ch = ungot_char;
		107	ungot_char=ungot_char2;
		108	ungot_char2=ungot_char3;
		109	ungot_char3=-1;
		110	last_returned_ch = ch;
		111	return ch;
		112	}
		113	do {
		114	if (read_buf_index >= read_buf_end) {
		115	if (!read_buf) {
		116	buffer_size = READ_BUF_LEN;
		117	read_buf = my_malloc (buffer_size);
		118	if (!read_buf) {
		119	buffer_size /= 4;
		120	read_buf = my_malloc (buffer_size);
		121	if (!read_buf)
		122	error_handler ("cannot allocate read buffer");
		123	}
		124	}
		125	read_buf_end = fread (read_buf, 1, buffer_size, f);
		126	read_buf_index = 0;
		127	if (!read_buf_end)
		128	return EOF;
		129	}
		130	ch = read_buf [read_buf_index++];
		131
		132
		133	lineno++;
		134	/* Convert \(newline) into \par here */
		135	if (last_returned_ch=='\\') {
		136	my_unget_char (' ');
		137	my_unget_char ('r');
		138	my_unget_char ('a');
		139	ch = 'p';
		140	break;
		141	}
		142	}
		143	}
		144	while (ch=='\r' /* || ch=='\n' */ );
		145
		146
		147
		148
		149	return ch;
		150	}
		151
		152
		153
		154	static char *input_str = NULL;
		155	static unsigned long current_max_length = 1;
		156
		157
		158
		159	/*========================================================================
		160	* Name: expand_word_buffer
		161	* Purpose: Expands the buffer used to store an incoming word.
		162	* This allows us to remove the limit on word length.
		163	* Args: None.
		164	* Returns: None.
		165	*=======================================================================*/
		166
		167
		168	expand_word_buffer ()
		169	{
		170	char *new_ptr;
		171	unsigned long old_length;
		172	if (!input_str)
		173	error_handler ("no input buffer allocated");
		174	old_length = current_max_length;
		175	current_max_length *= 2;
		176	new_ptr = my_malloc (current_max_length);
		177	if (!new_ptr)
		178	error_handler ("out of memory while resizing buffer");
		179
		180
		181	my_free (input_str);
		182	input_str = new_ptr;
		183	return TRUE;
		184	}
		185
		186
		187
		188
		189	/*========================================================================
		190	* Name: read_word
		191	* Purpose: The core of the parser, this reads a word.
		192	* Args: Input file.
		193	* Returns: Number of characters in the word, or zero.
		194	* Note: The word buffer is static and local to this file.
		195	*=======================================================================*/
		196
		197
		198	read_word (FILE *f)
		199	{
		200	int ch, ch2, ix=0;
		201	int have_whitespace=FALSE;
		202	int is_control_word=FALSE;
		203	int has_numeric_param=FALSE; /* if is_control_word==TRUE */
		204	int need_unget=FALSE;
		205
		206
		207
		208
		209
		210
		211	*/
		212	input_str = my_malloc (current_max_length);
		213	if (!input_str)
		214	error_handler("cannot allocate word storage");
		215
		216
		217	ch = my_getchar(f);
		218	}
		219	while (ch=='\n');
		220
		221
		222	{
		223	/* Compress multiple space chars down to one.
		224	*/
		225	while (ch == ' ') {
		226	ch = my_getchar(f);
		227	have_whitespace=TRUE;
		228	}
		229	if (have_whitespace) {
		230	my_unget_char (ch);
		231	input_str[0]=' ';
		232	input_str[1]=0;
		233	return 1;
		234	}
		235	}
		236
		237
		238	{
		239	case EOF:
		240	return 0;
		241
		242
		243	ch2 = my_getchar(f);
		244
		245
		246	*/
		247	switch (ch2)
		248	{
		249	case '\n':
		250	strcpy (input_str, "\\par");
		251	return 4;
		252	case '~':
		253	case '{':
		254	case '}':
		255	case '\\':
		256	case '_':
		257	case '-':
		258	input_str[0] = '\\';
		259	input_str[1] = ch2;
		260	input_str[2] = 0;
		261	return 2;
		262	case '\'':
		263	/* Preserve \'## expressions (hex char exprs) for later.
		264	*/
		265	input_str[0]='\\';
		266	input_str[1]='\'';
		267	ix=2;
		268	if(ix==current_max_length) {
		269	if (!expand_word_buffer ())
		270	error_handler("word too long");
		271	}
		272	ch = my_getchar(f);
		273	input_str[ix++]=ch;
		274	if(ix==current_max_length) {
		275	if (!expand_word_buffer ())
		276	error_handler("word too long");
		277	}
		278	ch = my_getchar(f);
		279	input_str[ix++]=ch;
		280	if(ix==current_max_length) {
		281	if (!expand_word_buffer ())
		282	error_handler("word too long");
		283	}
		284	input_str[ix]=0;
		285	return ix;
		286	}
		287
		288
		289	ix=1;
		290	input_str[0]=ch;
		291	ch=ch2;
		292	break;
		293
		294
		295	/* In RTF, a tab char is the same as \tab.
		296	*/
		297	strcpy (input_str, "\\tab");
		298	return 4;
		299
		300
		301	case '}':
		302	case ';':
		303	input_str[0]=ch;
		304	input_str[1]=0;
		305	return 1;
		306
		307
		308
		309
		310	{
		311	/* Several chars always ends a word, and we need to save them.
		312	*/
		313	if (ch=='\t' || ch=='{' || ch=='}' || ch=='\\') {
		314	need_unget=TRUE;
		315	break;
		316	}
		317
		318
		319	* A newline is ignored if this is not a command word.
		320	*/
		321	if (ch=='\n') {
		322	if (is_control_word)
		323	break;
		324	ch = my_getchar(f);
		325	continue;
		326	}
		327
		328
		329	* A semicolon never ends a regular word.
		330	*/
		331	if (ch==';') {
		332	if (is_control_word) {
		333	need_unget=TRUE;
		334	break;
		335	}
		336	}
		337
		338
		339	* any word, and if it does not follow a command,
		340	* then it is a word in itself.
		341	*/
		342	if (ch==' ') {
		343	if (!is_control_word)
		344	need_unget=TRUE;
		345	break;
		346	}
		347
		348
		349	*/
		350	if (is_control_word) {
		351	if (!has_numeric_param && (isdigit(ch) || ch=='-'))
		352	has_numeric_param = TRUE;
		353	else
		354	if (has_numeric_param && !isdigit(ch)) {
		355	if (ch!=' ')
		356	need_unget=TRUE;
		357	break;
		358	}
		359	}
		360
		361
		362	if (ix==current_max_length) {
		363	if (!expand_word_buffer ())
		364	error_handler("word too long");
		365	}
		366	ch = my_getchar (f);
		367	}
		368
		369
		370	my_unget_char(ch);
		371
		372
		373	return ix;
		374	}
		375
		376
		377
		378	/*========================================================================
		379	* Name: word_read
		380	* Purpose: This is the recursive metareader which pieces together the
		381	* structure of Word objects.
		382	* Args: Input file.
		383	* Returns: Tree of Word objects.
		384	*=======================================================================*/
		385
		386
		387	word_read (FILE* f) {
		388	Word * prev_word = NULL;
		389	Word * first_word = NULL;
		390	Word * new_word = NULL; /* temp */
		391
		392
		393
		394
		395	if (!read_word(f)) {
		396	return first_word;
		397	}
		398
		399
		400
		401	/* Process subwords */
		402
		403
		404	printf ("processing subword...\n");
		405	#endif
		406
		407
		408	new_word = word_new (NULL);
		409	if (!new_word)
		410	error_handler ("cannot allocate word");
		411
		412
		413	new_word->child = word_read (f);
		414	if (!new_word->hash_index && !new_word->child)
		415	{
		416	/* printf ("unable to read children!\n"); */
		417	}
		418
		419
		420	#if 0
		421	printf ("returning from word_read.\n");
		422	#endif
		423	return first_word;
		424	} else {
		425	new_word = word_new (input_str);
		426	}
		427
		428
		429
		430
		431
		432
		433
		434
		435	*/
		436	my_free (input_str);
		437	input_str = NULL;
		438	}
		439	while(1);
		440
		441
		442

Subversion Repositories Kolibri OS

(root)/programs/media/unrtf/parse.c @ 8872 – Rev 8335