Subversion Repositories Kolibri OS

Rev

Blame | Last modification | View Log | Download | RSS feed

  1. /*
  2.  * Tiny BASIC
  3.  * Tokenisation module
  4.  *
  5.  * Copyright (C) Damian Gareth Walker 2019
  6.  * Created: 04-Aug-2019
  7.  */
  8.  
  9.  
  10. /* included headers */
  11. #include <stdio.h>
  12. #include <stdlib.h>
  13. #include <string.h>
  14. #include "token.h"
  15. #include "tokeniser.h"
  16. #include "common.h"
  17.  
  18.  
  19. /*
  20.  * Data definitions
  21.  */
  22.  
  23.  
  24. /* modes of reading */
  25. typedef enum {
  26.   DEFAULT_MODE, /* we have no idea what's coming */
  27.   COMMENT_MODE, /* reading a comment */
  28.   WORD_MODE, /* reading an identifier or keyword */
  29.   NUMBER_MODE, /* reading a numeric constant */
  30.   LESS_THAN_MODE, /* reading an operator staring with < */
  31.   GREATER_THAN_MODE, /* reading an operator starting with > */
  32.   STRING_LITERAL_MODE, /* reading a string literal */
  33.   UNKNOWN_MODE /* we are lost */
  34. } Mode;
  35.  
  36. /* current state information */
  37. typedef struct {
  38.   Token *token; /* token to return */
  39.   Mode mode; /* current reading mode */
  40.   int ch; /* last-read character */
  41.   char *content; /* content of token under construction */
  42.   int max; /* memory reserved for content */
  43. } TokeniserState;
  44.  
  45. /* Private data */
  46. typedef struct {
  47.   FILE *input; /* the input file */
  48.   int line, /* current line in the input file */
  49.     pos, /* current position on the input line */
  50.     start_line, /* line on which a token started */
  51.     start_pos; /* position on which a token started */
  52. } Private;
  53.  
  54.  
  55. /*
  56.  * File level variables
  57.  */
  58.  
  59.  
  60. /* convenience variables */
  61. static TokenStream *this; /* token stream passed in to public method */
  62. static Private *data; /* private data for this */
  63.  
  64.  
  65. /*
  66.  * Level 2 Tokeniser Routines
  67.  */
  68.  
  69.  
  70. /*
  71.  * Read a character and update the position counter
  72.  * globals:
  73.  *   int               line    current line after character read
  74.  *   int               pos     current character position after character read
  75.  * params:
  76.  *   TokeniserState*   state   current state of the tokeniser
  77.  * returns:
  78.  *   int              character just read
  79.  */
  80. static int read_character (TokeniserState *state) {
  81.  
  82.   int ch; /* character read from stream */
  83.  
  84.   /* read the character */
  85.   ch = fgetc (data->input);
  86.  
  87.   /* update the position and line counters */
  88.   if (ch == '\n') {
  89.     ++data->line;
  90.     data->pos = 0;
  91.   } else {
  92.     ++data->pos;
  93.   }
  94.  
  95.   /* return the character */
  96.   return ch;
  97. }
  98.  
  99. /*
  100.  * Push a character back into the input stream and update position markers
  101.  * globals:
  102.  *   int               line    line number rolled back
  103.  *   int               pos     character position rolled back
  104.  * params:
  105.  *   TokeniserState*   state   current state of the tokeniser
  106.  */
  107. static void unread_character (TokeniserState *state) {
  108.   ungetc (state->ch, data->input);
  109.   if (state->ch == '\n')
  110.     --data->line;
  111.   else
  112.     --data->pos;
  113. }
  114.  
  115. /*
  116.  * Append the last read character to the token content
  117.  * params:
  118.  *   TokeniserState*   state   current state of the tokeniser
  119.  */
  120. static void store_character (TokeniserState *state) {
  121.  
  122.   /* variable declarations */
  123.   char *temp; /* temporary pointer to content */
  124.   int length; /* current length of token */
  125.  
  126.   /* allocate more memory for the token content if necessary */
  127.   if (strlen (state->content) == state->max - 1) {
  128.     temp = state->content;
  129.     state->max *= 2;
  130.     state->content = malloc (state->max);
  131.     strcpy (state->content, temp);
  132.     free (temp);
  133.   }
  134.  
  135.   /* now add the character to the token */
  136.   length = strlen (state->content);
  137.   state->content [length++] = state->ch;
  138.   state->content [length] = '\0';
  139. }
  140.  
  141. /*
  142.  * Identify the various recognised symbols
  143.  * params:
  144.  *   int   ch     the character to identify
  145.  * returns:
  146.  *   TokenClass   the token class recognised by the parser
  147.  */
  148. static TokenClass identify_symbol (int ch) {
  149.   switch (ch) {
  150.   case '+':
  151.     return TOKEN_PLUS;
  152.     break;
  153.   case '-':
  154.     return TOKEN_MINUS;
  155.     break;
  156.   case '*':
  157.     return TOKEN_MULTIPLY;
  158.     break;
  159.   case '/':
  160.     return TOKEN_DIVIDE;
  161.     break;
  162.   case '=':
  163.     return TOKEN_EQUAL;
  164.     break;
  165.   case '(':
  166.     return TOKEN_LEFT_PARENTHESIS;
  167.     break;
  168.   case ')':
  169.     return TOKEN_RIGHT_PARENTHESIS;
  170.     break;
  171.   case ',':
  172.     return TOKEN_COMMA;
  173.     break;
  174.   default:
  175.     return TOKEN_SYMBOL;
  176.   }
  177. }
  178.  
  179. static TokenClass identify_word (char *word) {
  180.   if (strlen (word) == 1)
  181.     return TOKEN_VARIABLE;
  182.   else if (! tinybasic_strcmp (word, "LET"))
  183.     return TOKEN_LET;
  184.   else if (! tinybasic_strcmp (word, "IF"))
  185.     return TOKEN_IF;
  186.   else if (! tinybasic_strcmp (word, "THEN"))
  187.     return TOKEN_THEN;
  188.   else if (! tinybasic_strcmp (word, "GOTO"))
  189.     return TOKEN_GOTO;
  190.   else if (! tinybasic_strcmp (word, "GOSUB"))
  191.     return TOKEN_GOSUB;
  192.   else if (! tinybasic_strcmp (word, "RETURN"))
  193.     return TOKEN_RETURN;
  194.   else if (! tinybasic_strcmp (word, "END"))
  195.     return TOKEN_END;
  196.   else if (! tinybasic_strcmp (word, "PRINT"))
  197.     return TOKEN_PRINT;
  198.   else if (! tinybasic_strcmp (word, "INPUT"))
  199.     return TOKEN_INPUT;
  200.   else if (! tinybasic_strcmp (word, "REM"))
  201.     return TOKEN_REM;
  202.   else
  203.     return TOKEN_WORD;
  204. }
  205.  
  206. /*
  207.  * Identify compound (multi-character) symbols.
  208.  * Also identifies some single-character symbols that can form
  209.  * the start of multi-character symbols.
  210.  * params:
  211.  *   char*   symbol   the symbol to identify
  212.  * returns:
  213.  *   TokenClass       the identification
  214.  */
  215. static TokenClass identify_compound_symbol (char *symbol) {
  216.   if (! strcmp (symbol, "<>")
  217.       || ! strcmp (symbol, "><"))
  218.     return TOKEN_UNEQUAL;
  219.   else if (! strcmp (symbol, "<"))
  220.     return TOKEN_LESSTHAN;
  221.   else if (! strcmp (symbol, "<="))
  222.     return TOKEN_LESSOREQUAL;
  223.   else if (! strcmp (symbol, ">"))
  224.     return TOKEN_GREATERTHAN;
  225.   else if (! strcmp (symbol, ">="))
  226.     return TOKEN_GREATEROREQUAL;
  227.   else
  228.     return TOKEN_SYMBOL;
  229. }
  230.  
  231.  
  232. /*
  233.  * Level 1 Tokeniser Routines
  234.  */
  235.  
  236.  
  237. /*
  238.  * Default mode - deal with character when state is unknown
  239.  * globals:
  240.  *   int               line         current line in the source file
  241.  *   int               pos          current character position in the source
  242.  *   int               start_line   line on which the current token started
  243.  *   int               start_pos    char pos on which the current token started
  244.  * params:
  245.  *   TokeniserState*   state        current state of the tokeniser
  246.  */
  247. static void default_mode (TokeniserState *state) {
  248.  
  249.   /* deal with non-EOL whitespace */
  250.   if (state->ch == ' ' ||
  251.       state->ch == '\t') {
  252.     state->ch = read_character (state);
  253.     data->start_line = data->line;
  254.     data->start_pos = data->pos;
  255.   }
  256.  
  257.   /* deal with EOL whitespace */
  258.   else if (state->ch == '\n') {
  259.     data->start_line = data->line - 1;
  260.     data->start_pos = data->pos;
  261.     state->token = new_Token_init
  262.       (TOKEN_EOL, data->start_line, data->start_pos, state->content);
  263.   }
  264.  
  265.   /* alphabetic characters start a word */
  266.   else if ((state->ch >= 'A' && state->ch <= 'Z') ||
  267.            (state->ch >= 'a' && state->ch <= 'z')) {
  268.     data->start_line = data->line;
  269.     data->start_pos = data->pos;
  270.     state->mode = WORD_MODE;
  271.   }
  272.  
  273.   /* digits start a number */
  274.   else if (state->ch >= '0' && state->ch <= '9')
  275.     state->mode = NUMBER_MODE;
  276.  
  277.   /* check for tokens starting with less-than (<, <=, <>) */
  278.   else if (state->ch == '<') {
  279.     data->start_line = data->line;
  280.     data->start_pos = data->pos;
  281.     store_character (state);
  282.     state->ch = read_character (state);
  283.     state->mode = LESS_THAN_MODE;
  284.   }
  285.  
  286.   /* check for tokens starting with greater-than (>, >=) */
  287.   else if (state->ch == '>') {
  288.     data->start_line = data->line;
  289.     data->start_pos = data->pos;
  290.     store_character (state);
  291.     state->ch = read_character (state);
  292.     state->mode = GREATER_THAN_MODE;
  293.   }
  294.  
  295.   /* deal with other symbol operators */
  296.   else if (strchr ("+-*/=(),", state->ch) != NULL) {
  297.     data->start_line = data->line;
  298.     data->start_pos = data->pos;
  299.     store_character (state);
  300.     state->token = new_Token_init (identify_symbol (state->ch),
  301.       data->start_line, data->start_pos, state->content);
  302.   }
  303.  
  304.   /* double quotes start a string literal */
  305.   else if (state->ch == '"') {
  306.     data->start_line = data->line;
  307.     data->start_pos = data->pos;
  308.     state->ch = read_character (state);
  309.     state->mode = STRING_LITERAL_MODE;
  310.   }
  311.  
  312.   /* detect end of file */
  313.   else if (state->ch == EOF) {
  314.     data->start_line = data->line;
  315.     data->start_pos = data->pos;
  316.     state->token = new_Token_init
  317.       (TOKEN_EOF, data->start_line, data->start_pos, state->content);
  318.   }
  319.  
  320.   /* other characters are illegal */
  321.   else {
  322.     data->start_line = data->line;
  323.     data->start_pos = data->pos;
  324.     store_character (state);
  325.     state->token = new_Token_init
  326.       (TOKEN_ILLEGAL, data->start_line, data->start_pos, state->content);
  327.   }
  328. }
  329.  
  330. /*
  331.  * Word mode - deal with character when building a word token
  332.  * globals:
  333.  *   int               start_line   line on which the current token started
  334.  *   int               start_pos    char pos on which the current token started
  335.  * params:
  336.  *   TokeniserState*   state        current state of the tokeniser
  337.  */
  338. static void word_mode (TokeniserState *state) {
  339.  
  340.   /* local variables */
  341.   TokenClass class; /* recognised class of keyword */
  342.    
  343.   /* add letters and digits to the token */
  344.   if ((state->ch >= 'A' && state->ch <= 'Z') ||
  345.       (state->ch >= 'a' && state->ch <= 'z')) {
  346.     store_character (state);
  347.     state->ch = read_character (state);
  348.   }
  349.    
  350.   /* other characters are pushed back for the next token */
  351.   else {
  352.     if (state->ch != EOF)
  353.       unread_character (state);
  354.     class = identify_word (state->content);
  355.     if (class == TOKEN_REM) {
  356.       *state->content = '\0';
  357.       state->mode = COMMENT_MODE;
  358.     }
  359.     else
  360.       state->token = new_Token_init
  361.         (class, data->start_line, data->start_pos, state->content);
  362.   }
  363. }
  364.  
  365. /*
  366.  * Comment mode - skip till end of line after a REM
  367.  * globals:
  368.  *   int               start_line   line on which the current token started
  369.  *   int               start_pos    char pos on which the current token started
  370.  * params:
  371.  *   TokeniserState*   state        current state of the tokeniser
  372.  */
  373. static void comment_mode (TokeniserState *state) {
  374.   if (state->ch == '\n')
  375.     state->mode = DEFAULT_MODE;
  376.   else
  377.     state->ch = read_character (state);
  378. }
  379.  
  380. /*
  381.  * Number mode - building a number token (integer only)
  382.  * globals:
  383.  *   int               start_line   line on which the current token started
  384.  *   int               start_pos    char pos on which the current token started
  385.  * params:
  386.  *   TokeniserState*   state        current state of the tokeniser
  387.  */
  388. static void number_mode (TokeniserState *state) {
  389.  
  390.   /* add digits to the token */
  391.   if (state->ch >= '0' && state->ch <= '9') {
  392.     store_character (state);
  393.     state->ch = read_character (state);
  394.   }
  395.    
  396.   /* other characters are pushed back for the next token */
  397.   else {
  398.     if (state->ch != EOF)
  399.       unread_character (state);
  400.     state->token = new_Token_init
  401.       (TOKEN_NUMBER, data->start_line, data->start_pos, state->content);
  402.   }
  403.  
  404. }
  405.  
  406. /*
  407.  * Less than mode - checking for <> and <= operators
  408.  * globals:
  409.  *   int               start_line   line on which the current token started
  410.  *   int               start_pos    char pos on which the current token started
  411.  * params:
  412.  *   TokeniserState*   state        current state of the tokeniser
  413.  */
  414. static void less_than_mode (TokeniserState *state) {
  415.   if (state->ch == '=' || state->ch == '>')
  416.     store_character (state);
  417.   else
  418.     unread_character (state);
  419.   state->token = new_Token_init
  420.     (identify_compound_symbol (state->content), data->start_line,
  421.      data->start_pos, state->content);
  422. }
  423.  
  424. /*
  425.  * Greater than mode - checking for >= and >< operators
  426.  * globals:
  427.  *   int               start_line   line on which the current token started
  428.  *   int               start_pos    char pos on which the current token started
  429.  * params:
  430.  *   TokeniserState*   state        current state of the tokeniser
  431.  */
  432. static void greater_than_mode (TokeniserState *state) {
  433.   if (state->ch == '=' || state->ch == '<')
  434.     store_character (state);
  435.   else
  436.   ungetc (state->ch, data->input);
  437.   state->token = new_Token_init
  438.     (identify_compound_symbol (state->content), data->start_line,
  439.      data->start_pos, state->content);
  440. }
  441.  
  442. /*
  443.  * String literal mode - reading a string
  444.  * globals:
  445.  *   int               start_line   line on which the current token started
  446.  *   int               start_pos    char pos on which the current token started
  447.  * params:
  448.  *   TokeniserState*   state        current state of the tokeniser
  449.  */
  450. static void string_literal_mode (TokeniserState *state) {
  451.  
  452.   /* a quote terminates the string */
  453.   if (state->ch == '"')
  454.     state->token = new_Token_init
  455.       (TOKEN_STRING, data->start_line, data->start_pos, state->content);
  456.  
  457.   /* a backslash escapes the next character */
  458.   else if (state->ch == '\\') {
  459.     state->ch = read_character (state);
  460.     store_character (state);
  461.     state->ch = read_character (state);
  462.   }
  463.  
  464.   /* EOF generates an error */
  465.   else if (state->ch == EOF)
  466.     state->token = new_Token_init
  467.       (TOKEN_ILLEGAL, data->start_line, data->start_pos, state->content);
  468.  
  469.   /* all other characters are part of the string */
  470.   else {
  471.     store_character (state);
  472.     state->ch = read_character (state);
  473.   }
  474. }
  475.  
  476.  
  477. /*
  478.  * Top Level Tokeniser Routines
  479.  */
  480.  
  481.  
  482. /*
  483.  * Get the next token
  484.  * params:
  485.  *   TokenStream*   token_stream   the token stream being processed
  486.  * returns:
  487.  *   Token*                        the token built
  488.  */
  489. static Token *next (TokenStream *token_stream) {
  490.  
  491.   /* local variables */
  492.   TokeniserState state; /* current state of reading */
  493.   Token *return_token; /* token to return */
  494.  
  495.   /* initialise */
  496.   this = token_stream;
  497.   data = this->data;
  498.   state.token = NULL;
  499.   state.mode = DEFAULT_MODE;
  500.   state.max = 1024;
  501.   state.content = malloc (state.max);
  502.   *(state.content) = '\0';
  503.   state.ch = read_character (&state);
  504.   /* main loop */
  505.   while (state.token == NULL) {
  506.     switch (state.mode) {
  507.     case DEFAULT_MODE:
  508.  
  509.       default_mode (&state);
  510.       break;
  511.     case COMMENT_MODE:
  512.       comment_mode (&state);
  513.       break;
  514.     case WORD_MODE:
  515.       word_mode (&state);
  516.       break;
  517.     case NUMBER_MODE:
  518.       number_mode (&state);
  519.       break;
  520.     case LESS_THAN_MODE:
  521.       less_than_mode (&state);
  522.       break;
  523.     case GREATER_THAN_MODE:
  524.       greater_than_mode (&state);
  525.       break;
  526.     case STRING_LITERAL_MODE:
  527.       string_literal_mode (&state);
  528.       break;
  529.     default:
  530.       state.token = new_Token_init
  531.         (TOKEN_EOF, data->start_line, data->start_pos, state.content);
  532.       state.ch = EOF; /* temporary hack */
  533.     }
  534.   }
  535.  
  536.   /* store token and release state memory */
  537.   return_token = state.token;
  538.   free (state.content);
  539.  
  540.   /* return result */
  541.   return return_token;
  542.  
  543. }
  544.  
  545. /*
  546.  * Getter for the current line number
  547.  * paramss:
  548.  *   TokenStream*   token_stream   the token stream being processed
  549.  * returns:
  550.  *   int                           the current line number returned
  551.  */
  552. static int get_line (TokenStream *token_stream) {
  553.   this = token_stream;
  554.   data = this->data;
  555.   return data->line;
  556. }
  557.  
  558. /*
  559.  * Destructor for a TokenStream
  560.  * params:
  561.  *   TokenStream*   token_stream   the doomed token stream
  562.  */
  563. static void destroy (TokenStream *token_stream) {
  564.   if (token_stream) {
  565.     if (token_stream->data)
  566.       free (token_stream->data);
  567.     free (token_stream);
  568.   }
  569. }
  570.  
  571.  
  572. /*
  573.  * Constructors
  574.  */
  575.  
  576.  
  577. /*
  578.  * Constructor for TokenStream
  579.  * params:
  580.  *   FILE*   input   Input file
  581.  * returns:
  582.  *   TokenStream*    The new token stream
  583.  */
  584. TokenStream *new_TokenStream (FILE *input) {
  585.  
  586.   /* allocate the memory */
  587.   this = malloc (sizeof (TokenStream));
  588.   this->data = data = malloc (sizeof (Private));
  589.  
  590.   /* initialise methods */
  591.   this->next = next;
  592.   this->get_line = get_line;
  593.   this->destroy = destroy;
  594.  
  595.   /* initialise data */
  596.   data->input = input;
  597.   data->line = data->start_line = 1;
  598.   data->pos = data->start_pos = 0;
  599.  
  600.   /* return new token stream */
  601.   return this;
  602. }
  603.