Erythro/src/Lexer.c


/*************/
/*GEMWIRE    */
/*    ERYTHRO*/
/*************/

#include <Defs.h>
#include <Data.h>


/* * * * * * * * * * * * * * * * * * * * * * * * * * * *
 * * * * * *    C H A R       S T R E A M    * * * * * *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * */
/*
 * Pushes a single character back so the next call to NextChar() returns it
 * before reading any further from the source file.
 *
 * There is only one pushback slot (Overread): a second call before the next
 * NextChar() overwrites the earlier character.
 * NextChar() tests the slot for non-zero, so pushing back a 0 is a no-op.
 */
static void ReturnCharToStream(int Char) {
    Overread = Char;
}

/*
 * Fetches the next character of input.
 *
 * A character previously handed to ReturnCharToStream() (held in Overread)
 * takes priority and is consumed first; otherwise one character is read
 * from SourceFile. Line is bumped whenever a newline is freshly read from
 * the file (a pushed-back newline is not counted a second time).
 *
 * Returns the character, or whatever fgetc() reports at end of input.
 */
static int NextChar(void) {
    int Pending = Overread;
    int Fresh;

    // Drain the pushback slot before touching the file.
    if(Pending != 0) {
        Overread = 0;
        return Pending;
    }

    Fresh = fgetc(SourceFile);
    if(Fresh == '\n') {
        Line++;
    }

    return Fresh;
}


/*
 * Skips whitespace (space, tab, newline, carriage return) and returns the
 * first character that is not one of those.
 */
static int FindChar() {
    int Char;

    do {
        Char = NextChar();
    } while(Char == ' ' || Char == '\t' || Char == '\n' || Char == '\r');

    return Char;
}

/*
 * Returns the zero-based index of Char within String, or -1 if String does
 * not contain it.
 *
 * strchr() treats the terminating NUL as part of the string, so searching
 * for '\0' would "succeed" and yield strlen(String). That is never a valid
 * position for callers looking up digits, so NUL is rejected up front.
 */
static int FindDigitFromPos(char* String, char Char) {
    const char* Result;

    if(Char == '\0')
        return -1;

    Result = strchr(String, Char);
    if(Result == NULL)
        return -1;

    // Pointer difference is ptrdiff_t; narrow it explicitly to the int we return.
    return (int)(Result - String);
}

/*
 * Checks that the current token has the given type and, if so, advances to
 * the next token.
 *
 * Type          - the token type CurrentToken must have.
 * TokenExpected - human-readable name of that token, used in the error text.
 *
 * On a mismatch, prints the expectation together with the current line
 * number and terminates the process with exit status 1.
 */
void VerifyToken(int Type, char* TokenExpected) {
    if(CurrentToken.type != Type) {
        printf("Expected %s on line %d\n", TokenExpected, Line);
        exit(1);
    }

    Tokenise(&CurrentToken);
}

// Single-slot holder for a token that has been handed back to the lexer.
// NULL means no token is currently waiting.
static struct Token* RejectedToken = NULL;

/*
 * Hands a token back to the lexer so the next Tokenise() call yields it
 * again instead of reading new input.
 *
 * Only the pointer is stored, not a copy of the token. Attempting to reject
 * while another rejection is still pending is reported through Die().
 */
void RejectToken(struct Token* Token) {
    if(RejectedToken != NULL) {
        Die("Cannot reject two tokens in a row!");
    }

    RejectedToken = Token;
}

/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 * * * *     L I T E R A L S   A N D   I D E N T I F I E R S     * * * *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

/*
 * Reads a run of decimal digits, starting with the already-consumed Char,
 * and returns their value.
 *
 * The first non-digit character is pushed back onto the stream.
 *
 * NOTE(review): the accumulation is not range-checked; a literal larger
 * than int can hold overflows - consider reporting an error instead.
 */
static int ReadInteger(int Char) {
    int Value = 0;

    for(;;) {
        int Digit = FindDigitFromPos("0123456789", Char);

        if(Digit < 0)
            break;

        Value = Value * 10 + Digit;
        Char = NextChar();
    }

    // Char is the first character that is not part of the number.
    ReturnCharToStream(Char);

    return Value;
}

/*
 * Reads an identifier-shaped word (variable name, function name or keyword)
 * into Buffer, starting with the already-consumed Char.
 *
 * Char   - first character of the word.
 * Buffer - destination; always NUL-terminated on return.
 * Limit  - capacity of Buffer including the terminator.
 *
 * Returns the number of characters stored (excluding the terminator). If
 * the word would not fit, prints a message with the current line number and
 * exits with status 1. The first character that is not part of the word is
 * pushed back onto the stream.
 */
static int ReadIdentifier(int Char, char* Buffer, int Limit) {
    int Length = 0;

    // Letters, digits and underscores may appear anywhere in the word.
    for(; isalnum(Char) || Char == '_'; Char = NextChar()) {
        if(Length >= Limit - 1) {
            printf("Identifier too long: %d\n", Line);
            exit(1);
        }

        Buffer[Length++] = Char;
    }

    // Char now holds the delimiter that ended the word; give it back.
    ReturnCharToStream(Char);
    Buffer[Length] = '\0';

    return Length;
}

/*
 * Reads the body of a character literal (the opening quote has already been
 * consumed) and returns the character it denotes.
 *
 * A backslash introduces an escape sequence: \a \b \f \n \r \t \v \\ \" \'
 * are translated to the matching character. Any other escape is reported
 * through DieChar(). The closing quote is left for the caller to check.
 */
static int ReadCharLiteral() {
    int Char = NextChar();

    // Plain character: nothing to translate.
    if(Char != '\\')
        return Char;

    Char = NextChar();

    switch(Char) {
        case 'a': Char = '\a'; break;
        case 'b': Char = '\b'; break;
        case 'f': Char = '\f'; break;
        case 'n': Char = '\n'; break;
        case 'r': Char = '\r'; break;
        case 't': Char = '\t'; break;
        case 'v': Char = '\v'; break;

        // These escapes stand for themselves.
        case '\\':
        case '"':
        case '\'':
            break;

        default:
            DieChar("Unknown Escape: ", Char);
            break;
    }

    return Char;
}

/*
 * Defines the reserved words of the language.
 *
 * Compares Str against every known keyword and returns the token type of
 * the one it equals exactly, or 0 when Str is not a keyword.
 * (Callers treat 0 as "no match" - presumably no keyword constant is 0;
 * verify against the token enum.)
 *
 * To add a keyword, append a row to the table below.
 */
static int ReadKeyword(char* Str) {
    static const struct {
        const char* Text;
        int         Type;
    } Keywords[] = {
        { "::",     KW_FUNC   },
        { "char",   TY_CHAR   },
        { "else",   KW_ELSE   },
        { "for",    KW_FOR    },
        { "int",    TY_INT    },
        { "if",     KW_IF     },
        { "long",   TY_LONG   },
        { "print",  KW_PRINT  },
        { "return", KW_RETURN },
        { "void",   TY_VOID   },
        { "while",  KW_WHILE  },
    };
    size_t Index;

    for(Index = 0; Index < sizeof(Keywords) / sizeof(Keywords[0]); Index++) {
        if(strcmp(Str, Keywords[Index].Text) == 0)
            return Keywords[Index].Type;
    }

    return 0;
}

/* * * * * * * * * * * * * * * * * * * * *
 * * * *      T O K E N I S E R    * * * *
 * * * * * * * * * * * * * * * * * * * * */

/*
 * Reads the next token from the input and stores it in *Token.
 *
 * If a token was previously handed back with RejectToken(), that token is
 * delivered again and no input is consumed. Otherwise leading whitespace is
 * skipped and one token is recognised:
 *   - single-character operators and punctuation,
 *   - the two-character operators  =?  =>  !=  <=  ::
 *   - character literals and decimal integers (both LI_INT, with the value
 *     in Token->value),
 *   - keywords and identifiers (identifier text is left in CurrentIdentifier).
 *
 * Returns 0 when the end of input is reached (Token->type is LI_EOF),
 * 1 otherwise. Characters that cannot start a token are reported through
 * DieChar().
 */
int Tokenise(struct Token* Token) {
    int Char, TokenType;

    if(RejectedToken != NULL) {
        // Deliver the rejected token to the caller. Assigning the pointer
        // itself would only change our local parameter, so copy the
        // contents instead (skipped when the caller passed the very same
        // token object back in).
        if(Token != RejectedToken)
            *Token = *RejectedToken;

        RejectedToken = NULL;
        return 1;
    }

    Char = FindChar();

    switch(Char) {
        case EOF:
            Token->type = LI_EOF;
            return 0;

        case '+':
            Token->type = AR_PLUS;
            break;

        case '-':
            Token->type = AR_MINUS;
            break;

        case '*':
            Token->type = AR_STAR;
            break;

        case '/':
            Token->type = AR_SLASH;
            break;

        case '&':
            Token->type = LI_AMP;
            break;

        case ',':
            Token->type = LI_COM;
            break;

        case '=':
            Char = NextChar();
            // "=?" is the compare-equality token.
            if(Char == '?') {
                Token->type = CMP_EQUAL;
            // "=>" is the greater-than-or-equal token.
            } else if(Char == '>') {
                Token->type = CMP_GTE;
            // Anything else: a plain "=" followed by an unrelated char, which goes back.
            } else {
                ReturnCharToStream(Char);
                Token->type = LI_EQUAL;
            }
            break;

        case '!':
            Char = NextChar();
            // "!=" is the compare-inequality token.
            if(Char == '=') {
                Token->type = CMP_INEQ;
            } else {
                // A lone '!' is not a token. Report it rather than returning
                // with Token->type still holding whatever the previous token left there.
                ReturnCharToStream(Char);
                DieChar("Unrecognized character", '!');
            }
            break;

        case '<':
            Char = NextChar();
            // "<=" is the less-than-or-equal token.
            if(Char == '=') {
                Token->type = CMP_LTE;
            } else {
                ReturnCharToStream(Char);
                Token->type = CMP_LT;
            }
            break;

        case '>':
            // No lookahead needed: greater-than-or-equal is spelled "=>"
            // and is recognised in the '=' case.
            Token->type = CMP_GT;
            break;

        case ';':
            Token->type = LI_SEMIC;
            break;

        case '(':
            Token->type = LI_LPARE;
            break;

        case ')':
            Token->type = LI_RPARE;
            break;

        case '{':
            Token->type = LI_LBRAC;
            break;

        case '}':
            Token->type = LI_RBRAC;
            break;

        case '[':
            Token->type = LI_LBRAS;
            break;

        case ']':
            Token->type = LI_RBRAS;
            break;

        case ':':
            Char = NextChar();

            // "::" introduces a function.
            if(Char == ':') {
                Token->type = KW_FUNC;
            } else {
                // A lone ':' is not a token; same reasoning as for a lone '!'.
                ReturnCharToStream(Char);
                DieChar("Unrecognized character", ':');
            }
            break;

        case '\'':
            // Character literals are represented as integer literals.
            Token->value = ReadCharLiteral();
            Token->type = LI_INT;

            if(NextChar() != '\'')
                Die("Expected '\\'' at the end of a character.");
            break;

        default:
            if(isdigit(Char)) {
                Token->value = ReadInteger(Char);
                Token->type = LI_INT;
                break;
            }

            // A variable/function/keyword may START with a letter or underscore.
            if(isalpha(Char) || Char == '_') {
                ReadIdentifier(Char, CurrentIdentifier, TEXTLEN);

                // ReadKeyword yields 0 for words that are not keywords.
                TokenType = ReadKeyword(CurrentIdentifier);
                Token->type = (TokenType != 0) ? TokenType : TY_IDENTIFIER;
                break;
            }

            DieChar("Unrecognized character", Char);

    }

    return 1;
}
First files. Currently has two bugs. First, all functions are resolved to index 0 (currently PrintInteger) Second, the register used for returning is immediately overwritten by the next allocated register. This means addition of function return values is a little silly. Also, commit signing! 2020-09-10 00:56:16 +00:00
			`/*************/`
			`/GEMWIRE /`
			`/* ERYTHRO*/`
			`/*************/`

			`#include <Defs.h>`
			`#include <Data.h>`


			`/* * * * * * * * * * * * * * * * * * * * * * * * * * * *`
			`* * * * * * C H A R S T R E AM * * * * * *`
			`* * * * * * * * * * * * * * * * * * * * * * * * * * * */`
			`static void ReturnCharToStream(int Char) {`
			`Overread = Char;`
			`}`

			`static int NextChar(void) {`
			`int Char;`

			`if(Overread) {`
			`Char = Overread;`
			`Overread = 0;`
			`return Char;`
			`}`

			`Char = fgetc(SourceFile);`

			`if(Char == '\n')`
			`Line++;`

			`return Char;`
			`}`


			`static int FindChar() {`
			`int Char;`

			`Char = NextChar();`

			`while(Char == ' ' \|\| Char == '\t' \|\| Char == '\n' \|\| Char == '\r') {`
			`Char = NextChar();`
			`}`

			`return Char;`
			`}`

			`static int FindDigitFromPos(char* String, char Char) {`
			`char* Result = strchr(String, Char);`
			`return(Result ? Result - String : -1);`
			`}`

			`void VerifyToken(int Type, char* TokenExpected) {`
			`if(CurrentToken.type == Type)`
			`Tokenise(&CurrentToken);`
			`else {`
			`printf("Expected %s on line %d\n", TokenExpected, Line);`
			`exit(1);`
			`}`
			`}`

			`static struct Token* RejectedToken = NULL;`

			`void RejectToken(struct Token* Token) {`
			`if(RejectedToken != NULL)`
			`Die("Cannot reject two tokens in a row!");`

			`RejectedToken = Token;`
			`}`

			`/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *`
			`* * * * L I T E R A L S A N D I D E N T I F I E R S * * * *`
			`* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */`

			`static int ReadInteger(int Char) {`
			`int CurrentChar = 0;`
			`int IntegerValue = 0;`

			`while((CurrentChar = FindDigitFromPos("0123456789", Char)) >= 0) {`
			`IntegerValue = IntegerValue * 10 + CurrentChar;`
			`Char = NextChar();`
			`}`

			`ReturnCharToStream(Char);`

			`return IntegerValue;`
			`}`

			`// Variable identifier, keyword, function.`
			`static int ReadIdentifier(int Char, char* Buffer, int Limit) {`
			`int ind = 0;`

			`// This defines the valid chars in a keyword/variable/function.`
			`while(isalpha(Char) \|\| isdigit(Char) \|\| Char == '_') {`
			`if (ind >= Limit - 1) {`
			`printf("Identifier too long: %d\n", Line);`
			`exit(1);`
			`} else {`
			`Buffer[ind++] = Char;`
			`}`

			`Char = NextChar();`
			`}`

			`// At this point, we've reached a non-keyword character`
			`ReturnCharToStream(Char);`
			`Buffer[ind] = '\0';`
			`return ind;`
			`}`

Add character literal parsing 2020-11-22 00:41:48 +00:00			`static int ReadCharLiteral() {`
			`int Char;`
			`Char = NextChar();`
			`if(Char == '\\') {`
			`switch(Char = NextChar()) {`
			`case 'a': return '\a';`
			`case 'b': return '\b';`
			`case 'f': return '\f';`
			`case 'n': return '\n';`
			`case 'r': return '\r';`
			`case 't': return '\t';`
			`case 'v': return '\v';`
			`case '\\': return '\\';`
			`case '"': return '"';`
			`case '\'': return '\'';`
			`default:`
Fix typo Lost my train of thought while typing this. 2020-11-22 00:43:32 +00:00			`DieChar("Unknown Escape: ", Char);`
Add character literal parsing 2020-11-22 00:41:48 +00:00			`}`
			`}`

			`return Char;`
			`}`

First files. Currently has two bugs. First, all functions are resolved to index 0 (currently PrintInteger) Second, the register used for returning is immediately overwritten by the next allocated register. This means addition of function return values is a little silly. Also, commit signing! 2020-09-10 00:56:16 +00:00			`/*`
			`* This function is what defines the valid keywords for the language`
			`* //TODO: move this to a static list?`
			`* //TODO: More optimisations?`
			`*`
			`*/`
			`static int ReadKeyword(char* Str) {`
			`// First, scan with reference intact.`
			`switch(*Str) {`
			`// This lets us case against the first char:`
			`case ':':`
			`if(!strcmp(Str, "::"))`
			`return KW_FUNC;`
			`break;`

			`case 'c':`
			`if(!strcmp(Str, "char"))`
			`return TY_CHAR;`
			`break;`

			`case 'e':`
			`if(!strcmp(Str, "else"))`
			`return KW_ELSE;`

			`break;`

			`case 'f':`
			`if(!strcmp(Str, "for"))`
			`return KW_FOR;`
			`break;`

			`case 'i':`

			`if(!strcmp(Str, "int"))`
			`return TY_INT;`

			`if(!strcmp(Str, "if"))`
			`return KW_IF;`

			`break;`

			`case 'l':`
			`if(!strcmp(Str, "long"))`
			`return TY_LONG;`

			`break;`

			`case 'p':`
			`// This is a huge optimisation once we have as many keywords as a fully featured language.`
			`if(!strcmp(Str, "print"))`
			`return KW_PRINT;`
			`break;`

			`case 'r':`
			`if(!strcmp(Str, "return"))`
			`return KW_RETURN;`
			`break;`

			`case 'v':`
			`if(!strcmp(Str, "void"))`
			`return TY_VOID;`
			`break;`

			`case 'w':`
			`if(!strcmp(Str, "while"))`
			`return KW_WHILE;`
			`break;`



			`}`

			`return 0;`
			`}`

			`/* * * * * * * * * * * * * * * * * * * * *`
			`* * * * T O K E N I S E R * * * *`
			`* * * * * * * * * * * * * * * * * * * * */`

			`int Tokenise(struct Token* Token) {`
			`int Char, TokenType;`

			`if(RejectedToken != NULL) {`
			`Token = RejectedToken;`
			`RejectedToken = NULL;`
			`return 1;`
			`}`

			`Char = FindChar();`

			`switch(Char) {`
			`case EOF:`
			`Token->type = LI_EOF;`
			`return 0;`

			`case '+':`
			`Token->type = AR_PLUS;`
			`break;`

			`case '-':`
			`Token->type = AR_MINUS;`
			`break;`

			`case '*':`
			`Token->type = AR_STAR;`
			`break;`

			`case '/':`
			`Token->type = AR_SLASH;`
			`break;`
Add support for pointers of char, int and long types 2020-09-13 01:26:49 +00:00
			`case '&':`
			`Token->type = LI_AMP;`
			`break;`
First files. Currently has two bugs. First, all functions are resolved to index 0 (currently PrintInteger) Second, the register used for returning is immediately overwritten by the next allocated register. This means addition of function return values is a little silly. Also, commit signing! 2020-09-10 00:56:16 +00:00
Allow global-scope declarations Function-local scope is still WIP, but you can now define things outside of function blocks. 2020-09-13 22:41:46 +00:00			`case ',':`
			`Token->type = LI_COM;`
			`break;`

First files. Currently has two bugs. First, all functions are resolved to index 0 (currently PrintInteger) Second, the register used for returning is immediately overwritten by the next allocated register. This means addition of function return values is a little silly. Also, commit signing! 2020-09-10 00:56:16 +00:00			`case '=':`
			`Char = NextChar();`
			`// If the next char is =, we have ==, the compare equality token.`
			`if(Char == '?') {`
			`Token->type = CMP_EQUAL;`
			`// if the next char is >, we have =>, the greater than or equal token.`
			`} else if(Char == '>') {`
			`Token->type = CMP_GTE;`
			`// If none of the above match, we have = and an extra char. Return the char and set the token`
			`} else {`
			`ReturnCharToStream(Char);`
			`Token->type = LI_EQUAL;`
			`}`
			`break;`

			`case '!':`
			`Char = NextChar();`
			`// If the next char is =, we have !=, the compare inequality operator.`
			`if(Char == '=') {`
			`Token->type = CMP_INEQ;`
			`// Otherwise, we have a spare char`
			`} else {`
			`ReturnCharToStream(Char);`
			`}`
			`break;`

			`case '<':`
			`Char = NextChar();`
			`// If the next char is =, we have <=, the less than or equal comparator.`
			`if(Char == '=') {`
			`Token->type = CMP_LTE;`
			`} else {`
			`ReturnCharToStream(Char);`
			`Token->type = CMP_LT;`
			`}`
			`break;`

			`case '>':`
			`// There is no special casing for >. Less than or equal is =>`
			`Token->type = CMP_GT;`
			`break;`

			`case ';':`
			`Token->type = LI_SEMIC;`
			`break;`

			`case '(':`
			`Token->type = LI_LPARE;`
			`break;`

			`case ')':`
			`Token->type = LI_RPARE;`
			`break;`

			`case '{':`
			`Token->type = LI_LBRAC;`
			`break;`

			`case '}':`
			`Token->type = LI_RBRAC;`
			`break;`

Start work on array parsing 2020-11-18 20:49:08 +00:00			`case '[':`
			`Token->type = LI_LBRAS;`
			`break;`

			`case ']':`
			`Token->type = LI_RBRAS;`
			`break;`

First files. Currently has two bugs. First, all functions are resolved to index 0 (currently PrintInteger) Second, the register used for returning is immediately overwritten by the next allocated register. This means addition of function return values is a little silly. Also, commit signing! 2020-09-10 00:56:16 +00:00			`case ':':`
			`Char = NextChar();`

			`if(Char == ':') {`
			`Token->type = KW_FUNC;`
			`} else {`
			`ReturnCharToStream(Char);`
			`}`
			`break;`

Add character literal parsing 2020-11-22 00:41:48 +00:00			`case '\'':`
			`Token->value = ReadCharLiteral();`
			`Token->type = LI_INT;`

			`if(NextChar() != '\'')`
			`Die("Expected '\\'' at the end of a character.");`
			`break;`

First files. Currently has two bugs. First, all functions are resolved to index 0 (currently PrintInteger) Second, the register used for returning is immediately overwritten by the next allocated register. This means addition of function return values is a little silly. Also, commit signing! 2020-09-10 00:56:16 +00:00			`default:`
			`if(isdigit(Char)) {`

			`Token->value = ReadInteger(Char);`
			`Token->type = LI_INT;`
			`break;`

			`} else if(isalpha(Char) \|\| Char == '_') { // This is what defines what a variable/function/keyword can START with.`
			`ReadIdentifier(Char, CurrentIdentifier, TEXTLEN);`

			`if(TokenType = ReadKeyword(CurrentIdentifier)) {`
			`Token->type = TokenType;`
			`break;`
			`}`

			`Token->type = TY_IDENTIFIER;`
			`break;`
			`//printf("Line %d: Unrecognized symbol %s\n", CurrentIdentifier, Line);`
			`//exit(1);`
			`}`


			`DieChar("Unrecognized character", Char);`

			`}`

			`return 1;`
			`}`