WebSVN – Kolibri OS – /programs/develop/tinybasic-1.0.4/src/tokeniser.c

/*
* Tiny BASIC
* Tokenisation module
*
* Copyright (C) Damian Gareth Walker 2019
* Created: 04-Aug-2019
*/
/* included headers */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "token.h"
#include "tokeniser.h"
#include "common.h"
/*
* Data definitions
*/
/* modes of reading */
typedef enum {
DEFAULT_MODE, /* we have no idea what's coming */
COMMENT_MODE, /* reading a comment */
WORD_MODE, /* reading an identifier or keyword */
NUMBER_MODE, /* reading a numeric constant */
LESS_THAN_MODE, /* reading an operator staring with < */
GREATER_THAN_MODE, /* reading an operator starting with > */
STRING_LITERAL_MODE, /* reading a string literal */
UNKNOWN_MODE /* we are lost */
} Mode;
/* current state information */
typedef struct {
Token *token; /* token to return */
Mode mode; /* current reading mode */
int ch; /* last-read character */
char *content; /* content of token under construction */
int max; /* memory reserved for content */
} TokeniserState;
/* Private data */
typedef struct {
FILE *input; /* the input file */
int line, /* current line in the input file */
pos, /* current position on the input line */
start_line, /* line on which a token started */
start_pos; /* position on which a token started */
} Private;
/*
* File level variables
*/
/* convenience variables */
static TokenStream *this; /* token stream passed in to public method */
static Private *data; /* private data for this */
/*
* Level 2 Tokeniser Routines
*/
/*
* Read a character and update the position counter
* globals:
* int line current line after character read
* int pos current character position after character read
* params:
* TokeniserState* state current state of the tokeniser
* returns:
* int character just read
*/
static int read_character (TokeniserState *state) {
int ch; /* character read from stream */
/* read the character */
ch = fgetc (data->input);
/* update the position and line counters */
if (ch == '\n') {
++data->line;
data->pos = 0;
} else {
++data->pos;
}
/* return the character */
return ch;
}
/*
* Push a character back into the input stream and update position markers
* globals:
* int line line number rolled back
* int pos character position rolled back
* params:
* TokeniserState* state current state of the tokeniser
*/
static void unread_character (TokeniserState *state) {
ungetc (state->ch, data->input);
if (state->ch == '\n')
--data->line;
else
--data->pos;
}
/*
* Append the last read character to the token content
* params:
* TokeniserState* state current state of the tokeniser
*/
static void store_character (TokeniserState *state) {
/* variable declarations */
char *temp; /* temporary pointer to content */
int length; /* current length of token */
/* allocate more memory for the token content if necessary */
if (strlen (state->content) == state->max - 1) {
temp = state->content;
state->max *= 2;
state->content = malloc (state->max);
strcpy (state->content, temp);
free (temp);
}
/* now add the character to the token */
length = strlen (state->content);
state->content [length++] = state->ch;
state->content [length] = '\0';
}
/*
* Identify the various recognised symbols
* params:
* int ch the character to identify
* returns:
* TokenClass the token class recognised by the parser
*/
static TokenClass identify_symbol (int ch) {
switch (ch) {
case '+':
return TOKEN_PLUS;
break;
case '-':
return TOKEN_MINUS;
break;
case '*':
return TOKEN_MULTIPLY;
break;
case '/':
return TOKEN_DIVIDE;
break;
case '=':
return TOKEN_EQUAL;
break;
case '(':
return TOKEN_LEFT_PARENTHESIS;
break;
case ')':
return TOKEN_RIGHT_PARENTHESIS;
break;
case ',':
return TOKEN_COMMA;
break;
default:
return TOKEN_SYMBOL;
}
}
static TokenClass identify_word (char *word) {
if (strlen (word) == 1)
return TOKEN_VARIABLE;
else if (! tinybasic_strcmp (word, "LET"))
return TOKEN_LET;
else if (! tinybasic_strcmp (word, "IF"))
return TOKEN_IF;
else if (! tinybasic_strcmp (word, "THEN"))
return TOKEN_THEN;
else if (! tinybasic_strcmp (word, "GOTO"))
return TOKEN_GOTO;
else if (! tinybasic_strcmp (word, "GOSUB"))
return TOKEN_GOSUB;
else if (! tinybasic_strcmp (word, "RETURN"))
return TOKEN_RETURN;
else if (! tinybasic_strcmp (word, "END"))
return TOKEN_END;
else if (! tinybasic_strcmp (word, "PRINT"))
return TOKEN_PRINT;
else if (! tinybasic_strcmp (word, "INPUT"))
return TOKEN_INPUT;
else if (! tinybasic_strcmp (word, "REM"))
return TOKEN_REM;
else
return TOKEN_WORD;
}
/*
* Identify compound (multi-character) symbols.
* Also identifies some single-character symbols that can form
* the start of multi-character symbols.
* params:
* char* symbol the symbol to identify
* returns:
* TokenClass the identification
*/
static TokenClass identify_compound_symbol (char *symbol) {
if (! strcmp (symbol, "<>")
|| ! strcmp (symbol, "><"))
return TOKEN_UNEQUAL;
else if (! strcmp (symbol, "<"))
return TOKEN_LESSTHAN;
else if (! strcmp (symbol, "<="))
return TOKEN_LESSOREQUAL;
else if (! strcmp (symbol, ">"))
return TOKEN_GREATERTHAN;
else if (! strcmp (symbol, ">="))
return TOKEN_GREATEROREQUAL;
else
return TOKEN_SYMBOL;
}
/*
* Level 1 Tokeniser Routines
*/
/*
* Default mode - deal with character when state is unknown
* globals:
* int line current line in the source file
* int pos current character position in the source
* int start_line line on which the current token started
* int start_pos char pos on which the current token started
* params:
* TokeniserState* state current state of the tokeniser
*/
static void default_mode (TokeniserState *state) {
/* deal with non-EOL whitespace */
if (state->ch == ' ' ||
state->ch == '\t') {
state->ch = read_character (state);
data->start_line = data->line;
data->start_pos = data->pos;
}
/* deal with EOL whitespace */
else if (state->ch == '\n') {
data->start_line = data->line - 1;
data->start_pos = data->pos;
state->token = new_Token_init
(TOKEN_EOL, data->start_line, data->start_pos, state->content);
}
/* alphabetic characters start a word */
else if ((state->ch >= 'A' && state->ch <= 'Z') ||
(state->ch >= 'a' && state->ch <= 'z')) {
data->start_line = data->line;
data->start_pos = data->pos;
state->mode = WORD_MODE;
}
/* digits start a number */
else if (state->ch >= '0' && state->ch <= '9')
state->mode = NUMBER_MODE;
/* check for tokens starting with less-than (<, <=, <>) */
else if (state->ch == '<') {
data->start_line = data->line;
data->start_pos = data->pos;
store_character (state);
state->ch = read_character (state);
state->mode = LESS_THAN_MODE;
}
/* check for tokens starting with greater-than (>, >=) */
else if (state->ch == '>') {
data->start_line = data->line;
data->start_pos = data->pos;
store_character (state);
state->ch = read_character (state);
state->mode = GREATER_THAN_MODE;
}
/* deal with other symbol operators */
else if (strchr ("+-*/=(),", state->ch) != NULL) {
data->start_line = data->line;
data->start_pos = data->pos;
store_character (state);
state->token = new_Token_init (identify_symbol (state->ch),
data->start_line, data->start_pos, state->content);
}
/* double quotes start a string literal */
else if (state->ch == '"') {
data->start_line = data->line;
data->start_pos = data->pos;
state->ch = read_character (state);
state->mode = STRING_LITERAL_MODE;
}
/* detect end of file */
else if (state->ch == EOF) {
data->start_line = data->line;
data->start_pos = data->pos;
state->token = new_Token_init
(TOKEN_EOF, data->start_line, data->start_pos, state->content);
}
/* other characters are illegal */
else {
data->start_line = data->line;
data->start_pos = data->pos;
store_character (state);
state->token = new_Token_init
(TOKEN_ILLEGAL, data->start_line, data->start_pos, state->content);
}
}
/*
* Word mode - deal with character when building a word token
* globals:
* int start_line line on which the current token started
* int start_pos char pos on which the current token started
* params:
* TokeniserState* state current state of the tokeniser
*/
static void word_mode (TokeniserState *state) {
/* local variables */
TokenClass class; /* recognised class of keyword */
/* add letters and digits to the token */
if ((state->ch >= 'A' && state->ch <= 'Z') ||
(state->ch >= 'a' && state->ch <= 'z')) {
store_character (state);
state->ch = read_character (state);
}
/* other characters are pushed back for the next token */
else {
if (state->ch != EOF)
unread_character (state);
class = identify_word (state->content);
if (class == TOKEN_REM) {
*state->content = '\0';
state->mode = COMMENT_MODE;
}
else
state->token = new_Token_init
(class, data->start_line, data->start_pos, state->content);
}
}
/*
* Comment mode - skip till end of line after a REM
* globals:
* int start_line line on which the current token started
* int start_pos char pos on which the current token started
* params:
* TokeniserState* state current state of the tokeniser
*/
static void comment_mode (TokeniserState *state) {
if (state->ch == '\n')
state->mode = DEFAULT_MODE;
else
state->ch = read_character (state);
}
/*
* Number mode - building a number token (integer only)
* globals:
* int start_line line on which the current token started
* int start_pos char pos on which the current token started
* params:
* TokeniserState* state current state of the tokeniser
*/
static void number_mode (TokeniserState *state) {
/* add digits to the token */
if (state->ch >= '0' && state->ch <= '9') {
store_character (state);
state->ch = read_character (state);
}
/* other characters are pushed back for the next token */
else {
if (state->ch != EOF)
unread_character (state);
state->token = new_Token_init
(TOKEN_NUMBER, data->start_line, data->start_pos, state->content);
}
}
/*
* Less than mode - checking for <> and <= operators
* globals:
* int start_line line on which the current token started
* int start_pos char pos on which the current token started
* params:
* TokeniserState* state current state of the tokeniser
*/
static void less_than_mode (TokeniserState *state) {
if (state->ch == '=' || state->ch == '>')
store_character (state);
else
unread_character (state);
state->token = new_Token_init
(identify_compound_symbol (state->content), data->start_line,
data->start_pos, state->content);
}
/*
* Greater than mode - checking for >= and >< operators
* globals:
* int start_line line on which the current token started
* int start_pos char pos on which the current token started
* params:
* TokeniserState* state current state of the tokeniser
*/
static void greater_than_mode (TokeniserState *state) {
if (state->ch == '=' || state->ch == '<')
store_character (state);
else
ungetc (state->ch, data->input);
state->token = new_Token_init
(identify_compound_symbol (state->content), data->start_line,
data->start_pos, state->content);
}
/*
* String literal mode - reading a string
* globals:
* int start_line line on which the current token started
* int start_pos char pos on which the current token started
* params:
* TokeniserState* state current state of the tokeniser
*/
static void string_literal_mode (TokeniserState *state) {
/* a quote terminates the string */
if (state->ch == '"')
state->token = new_Token_init
(TOKEN_STRING, data->start_line, data->start_pos, state->content);
/* a backslash escapes the next character */
else if (state->ch == '\\') {
state->ch = read_character (state);
store_character (state);
state->ch = read_character (state);
}
/* EOF generates an error */
else if (state->ch == EOF)
state->token = new_Token_init
(TOKEN_ILLEGAL, data->start_line, data->start_pos, state->content);
/* all other characters are part of the string */
else {
store_character (state);
state->ch = read_character (state);
}
}
/*
* Top Level Tokeniser Routines
*/
/*
* Get the next token
* params:
* TokenStream* token_stream the token stream being processed
* returns:
* Token* the token built
*/
static Token *next (TokenStream *token_stream) {
/* local variables */
TokeniserState state; /* current state of reading */
Token *return_token; /* token to return */
/* initialise */
this = token_stream;
data = this->data;
state.token = NULL;
state.mode = DEFAULT_MODE;
state.max = 1024;
state.content = malloc (state.max);
*(state.content) = '\0';
state.ch = read_character (&state);
/* main loop */
while (state.token == NULL) {
switch (state.mode) {
case DEFAULT_MODE:
default_mode (&state);
break;
case COMMENT_MODE:
comment_mode (&state);
break;
case WORD_MODE:
word_mode (&state);
break;
case NUMBER_MODE:
number_mode (&state);
break;
case LESS_THAN_MODE:
less_than_mode (&state);
break;
case GREATER_THAN_MODE:
greater_than_mode (&state);
break;
case STRING_LITERAL_MODE:
string_literal_mode (&state);
break;
default:
state.token = new_Token_init
(TOKEN_EOF, data->start_line, data->start_pos, state.content);
state.ch = EOF; /* temporary hack */
}
}
/* store token and release state memory */
return_token = state.token;
free (state.content);
/* return result */
return return_token;
}
/*
* Getter for the current line number
* paramss:
* TokenStream* token_stream the token stream being processed
* returns:
* int the current line number returned
*/
static int get_line (TokenStream *token_stream) {
this = token_stream;
data = this->data;
return data->line;
}
/*
* Destructor for a TokenStream
* params:
* TokenStream* token_stream the doomed token stream
*/
static void destroy (TokenStream *token_stream) {
if (token_stream) {
if (token_stream->data)
free (token_stream->data);
free (token_stream);
}
}
/*
* Constructors
*/
/*
* Constructor for TokenStream
* params:
* FILE* input Input file
* returns:
* TokenStream* The new token stream
*/
TokenStream *new_TokenStream (FILE *input) {
/* allocate the memory */
this = malloc (sizeof (TokenStream));
this->data = data = malloc (sizeof (Private));
/* initialise methods */
this->next = next;
this->get_line = get_line;
this->destroy = destroy;
/* initialise data */
data->input = input;
data->line = data->start_line = 1;
data->pos = data->start_pos = 0;
/* return new token stream */
return this;
}

Subversion Repositories Kolibri OS

(root)/programs/develop/tinybasic-1.0.4/src/tokeniser.c – Rev 8733