Erythro/src/Lexer.c

370 lines
8.6 KiB
C
Raw Normal View History

/*************/
/*GEMWIRE */
/* ERYTHRO*/
/*************/
#include <limits.h>
#include <Defs.h>
#include <Data.h>
/* * * * * * * * * * * * * * * * * * * * * * * * * * * *
* * * * * * C H A R   S T R E A M * * * * * *
* * * * * * * * * * * * * * * * * * * * * * * * * * * */
/*
 * Push one character back so that the next NextChar() call yields it again.
 * Only a single character is remembered: a second call before the first is
 * consumed overwrites it. NextChar() treats a stored value of 0 as "nothing
 * pending", so pushing back 0 has no effect.
 */
static void ReturnCharToStream(int Char)
{
    Overread = Char;
}
/*
 * Fetch the next character of input.
 * A pending pushed-back character (non-zero Overread) is handed out first and
 * cleared. Otherwise one character is read from SourceFile, and Line is
 * incremented whenever that character is a newline.
 * Returns the character, or whatever fgetc() reports at end of input.
 */
static int NextChar(void) {
    if (Overread != 0) {
        int Pending = Overread;
        Overread = 0;
        return Pending;
    }

    int Fetched = fgetc(SourceFile);
    if (Fetched == '\n') {
        Line++;
    }
    return Fetched;
}
/*
 * Return the next character that is not a space, tab, newline or
 * carriage return, discarding any such characters that precede it.
 */
static int FindChar() {
    int Candidate;

    do {
        Candidate = NextChar();
    } while (Candidate == ' ' || Candidate == '\t' ||
             Candidate == '\n' || Candidate == '\r');

    return Candidate;
}
/*
 * Locate Char within String.
 * Returns the zero-based index of its first occurrence, or -1 when absent.
 *
 * A NUL Char is always reported as absent: strchr() would otherwise match
 * the string's terminator and yield strlen(String), which ReadInteger
 * would accept as a digit value.
 */
static int FindDigitFromPos(char* String, char Char) {
    if (Char == '\0')
        return -1;

    char* Result = strchr(String, Char);
    // The pointer difference is bounded by the string length; narrow it explicitly.
    return Result ? (int)(Result - String) : -1;
}
/*
 * Require the current token to be of the given Type.
 * On a match, the next token is scanned into CurrentToken.
 * On a mismatch, "Expected <TokenExpected> on line <Line>" is printed and
 * the program exits with status 1.
 */
void VerifyToken(int Type, char* TokenExpected) {
    if (CurrentToken.type != Type) {
        printf("Expected %s on line %d\n", TokenExpected, Line);
        exit(1);
    }

    Tokenise(&CurrentToken);
}
/* The single token, if any, that has been handed back to the lexer. */
static struct Token* RejectedToken = NULL;

/*
 * Hand a token back to the lexer, so that the next Tokenise() call returns
 * without reading any further input.
 * Only one token may be pending at a time; a second rejection before the
 * first has been consumed is reported through Die().
 */
void RejectToken(struct Token* Token) {
    if (RejectedToken) {
        Die("Cannot reject two tokens in a row!");
    }

    RejectedToken = Token;
}
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* * * * L I T E R A L S A N D I D E N T I F I E R S * * * *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
/*
 * Read a decimal integer literal.
 * Char is the first character of the literal, already consumed by the caller;
 * further digits are pulled from the stream and the first non-digit is
 * pushed back.
 * Returns the literal's value. A literal too large for an int is reported
 * through Die() instead of being accumulated, because signed overflow is
 * undefined behaviour.
 */
static int ReadInteger(int Char) {
    int Digit = 0;
    int IntegerValue = 0;

    while ((Digit = FindDigitFromPos("0123456789", Char)) >= 0) {
        // Test before multiplying so the accumulation itself can never overflow.
        // NOTE(review): assumes Die() does not return, as its other uses in this file imply.
        if (IntegerValue > (INT_MAX - Digit) / 10)
            Die("Integer literal too large");

        IntegerValue = IntegerValue * 10 + Digit;
        Char = NextChar();
    }

    // Char is the first character that is not part of the literal.
    ReturnCharToStream(Char);
    return IntegerValue;
}
/*
 * Read an identifier (variable, keyword or function name) into Buffer.
 * Char is the first character, already consumed by the caller. Characters
 * are accepted while they are letters, digits or '_'; the first character
 * that is not is pushed back onto the stream.
 * At most Limit - 1 characters are stored, followed by a NUL terminator.
 * A longer identifier prints an error carrying the current line and exits
 * with status 1.
 * Returns the number of characters stored.
 */
static int ReadIdentifier(int Char, char* Buffer, int Limit) {
    int Length = 0;

    for (; isalpha(Char) || isdigit(Char) || Char == '_'; Char = NextChar()) {
        // Keep one slot free for the terminator.
        if (Length >= Limit - 1) {
            printf("Identifier too long: %d\n", Line);
            exit(1);
        }
        Buffer[Length++] = Char;
    }

    // Char now holds the first character that cannot be part of an identifier.
    ReturnCharToStream(Char);
    Buffer[Length] = '\0';
    return Length;
}
2020-11-22 00:41:48 +00:00
/*
 * Read the body of a character literal; the caller has already consumed
 * the opening quote.
 * A backslash introduces an escape: \a \b \f \n \r \t \v \\ \" \' map to
 * the corresponding C character, and any other escape is reported through
 * DieChar().
 * Returns the character's value. The closing quote is not consumed here.
 */
static int ReadCharLiteral() {
    int Char = NextChar();

    // Anything other than a backslash stands for itself.
    if (Char != '\\')
        return Char;

    switch (Char = NextChar()) {
        case 'a':  return '\a';
        case 'b':  return '\b';
        case 'f':  return '\f';
        case 'n':  return '\n';
        case 'r':  return '\r';
        case 't':  return '\t';
        case 'v':  return '\v';
        case '\\': return '\\';
        case '"':  return '"';
        case '\'': return '\'';
        default:
            DieChar("Unknown Escape: ", Char);
    }

    // Only reached if DieChar() returns; the unrecognised escape character is handed back.
    return Char;
}
/*
 * Map an identifier string onto its keyword or type token.
 * The table below is the full list of words the lexer treats as keywords.
 * Returns the matching token value, or 0 when Str is not a keyword.
 */
static int ReadKeyword(char* Str) {
    static const struct {
        const char* Text;
        int Type;
    } Keywords[] = {
        { "::",     KW_FUNC   },
        { "char",   TY_CHAR   },
        { "else",   KW_ELSE   },
        { "for",    KW_FOR    },
        { "int",    TY_INT    },
        { "if",     KW_IF     },
        { "long",   TY_LONG   },
        { "print",  KW_PRINT  },
        { "return", KW_RETURN },
        { "void",   TY_VOID   },
        { "while",  KW_WHILE  },
    };
    const size_t Count = sizeof(Keywords) / sizeof(Keywords[0]);

    for (size_t Index = 0; Index < Count; Index++) {
        // Comparing first characters lets most entries be skipped without a strcmp() call.
        if (Keywords[Index].Text[0] == *Str &&
            strcmp(Str, Keywords[Index].Text) == 0) {
            return Keywords[Index].Type;
        }
    }

    return 0;
}
/* * * * * * * * * * * * * * * * * * * * *
* * * * T O K E N I S E R * * * *
* * * * * * * * * * * * * * * * * * * * */
int Tokenise(struct Token* Token) {
int Char, TokenType;
if(RejectedToken != NULL) {
Token = RejectedToken;
RejectedToken = NULL;
return 1;
}
Char = FindChar();
switch(Char) {
case EOF:
Token->type = LI_EOF;
return 0;
case '+':
Token->type = AR_PLUS;
break;
case '-':
Token->type = AR_MINUS;
break;
case '*':
Token->type = AR_STAR;
break;
case '/':
Token->type = AR_SLASH;
break;
case '&':
Token->type = LI_AMP;
break;
case ',':
Token->type = LI_COM;
break;
case '=':
Char = NextChar();
// If the next char is =, we have ==, the compare equality token.
if(Char == '?') {
Token->type = CMP_EQUAL;
// if the next char is >, we have =>, the greater than or equal token.
} else if(Char == '>') {
Token->type = CMP_GTE;
// If none of the above match, we have = and an extra char. Return the char and set the token
} else {
ReturnCharToStream(Char);
Token->type = LI_EQUAL;
}
break;
case '!':
Char = NextChar();
// If the next char is =, we have !=, the compare inequality operator.
if(Char == '=') {
Token->type = CMP_INEQ;
// Otherwise, we have a spare char
} else {
ReturnCharToStream(Char);
}
break;
case '<':
Char = NextChar();
// If the next char is =, we have <=, the less than or equal comparator.
if(Char == '=') {
Token->type = CMP_LTE;
} else {
ReturnCharToStream(Char);
Token->type = CMP_LT;
}
break;
case '>':
// There is no special casing for >. Less than or equal is =>
Token->type = CMP_GT;
break;
case ';':
Token->type = LI_SEMIC;
break;
case '(':
Token->type = LI_LPARE;
break;
case ')':
Token->type = LI_RPARE;
break;
case '{':
Token->type = LI_LBRAC;
break;
case '}':
Token->type = LI_RBRAC;
break;
2020-11-18 20:49:08 +00:00
case '[':
Token->type = LI_LBRAS;
break;
case ']':
Token->type = LI_RBRAS;
break;
case ':':
Char = NextChar();
if(Char == ':') {
Token->type = KW_FUNC;
} else {
ReturnCharToStream(Char);
}
break;
2020-11-22 00:41:48 +00:00
case '\'':
Token->value = ReadCharLiteral();
Token->type = LI_INT;
if(NextChar() != '\'')
Die("Expected '\\'' at the end of a character.");
break;
default:
if(isdigit(Char)) {
Token->value = ReadInteger(Char);
Token->type = LI_INT;
break;
} else if(isalpha(Char) || Char == '_') { // This is what defines what a variable/function/keyword can START with.
ReadIdentifier(Char, CurrentIdentifier, TEXTLEN);
if(TokenType = ReadKeyword(CurrentIdentifier)) {
Token->type = TokenType;
break;
}
Token->type = TY_IDENTIFIER;
break;
//printf("Line %d: Unrecognized symbol %s\n", CurrentIdentifier, Line);
//exit(1);
}
DieChar("Unrecognized character", Char);
}
return 1;
}