2020-09-10 00:56:16 +00:00
|
|
|
|
|
|
|
/*************/
|
|
|
|
/*GEMWIRE */
|
|
|
|
/* ERYTHRO*/
|
|
|
|
/*************/
|
|
|
|
|
|
|
|
#include <limits.h>
#include <Defs.h>
#include <Data.h>
|
|
|
|
|
|
|
|
|
|
|
|
/* * * * * * * * * * * * * * * * * * * * * * * * * * * *
|
|
|
|
* * * * * *   C H A R   S T R E A M   * * * * * *
|
|
|
|
* * * * * * * * * * * * * * * * * * * * * * * * * * * */
|
|
|
|
static void ReturnCharToStream(int Char) {
|
|
|
|
Overread = Char;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int NextChar(void) {
|
|
|
|
int Char;
|
|
|
|
|
|
|
|
if(Overread) {
|
|
|
|
Char = Overread;
|
|
|
|
Overread = 0;
|
|
|
|
return Char;
|
|
|
|
}
|
|
|
|
|
|
|
|
Char = fgetc(SourceFile);
|
|
|
|
|
|
|
|
if(Char == '\n')
|
|
|
|
Line++;
|
|
|
|
|
|
|
|
return Char;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Skips whitespace and returns the first character that is not a space,
 * tab, newline or carriage return.
 *
 * @return The first non-whitespace character from NextChar()
 *         (this can be EOF, which is not treated as whitespace).
 */
static int FindChar() {
    int Candidate;

    do {
        Candidate = NextChar();
    } while (Candidate == ' '  || Candidate == '\t' ||
             Candidate == '\n' || Candidate == '\r');

    return Candidate;
}
|
|
|
|
|
|
|
|
/*
 * Finds the zero-based position of Char inside String.
 *
 * @param String: NUL-terminated table of characters to search.
 * @param Char:   The character to look for.
 * @return The index of the first occurrence of Char in String,
 *         or -1 if Char does not occur.
 *
 * strchr() treats the terminating NUL as part of the string, so looking
 * up '\0' would otherwise "succeed" with index strlen(String). A NUL is
 * never a valid table entry, so it is reported as not found.
 */
static int FindDigitFromPos(char* String, char Char) {
    if (Char == '\0')
        return -1;

    char* Result = strchr(String, Char);

    return Result ? (int)(Result - String) : -1;
}
|
|
|
|
|
|
|
|
void VerifyToken(int Type, char* TokenExpected) {
|
|
|
|
if(CurrentToken.type == Type)
|
|
|
|
Tokenise(&CurrentToken);
|
|
|
|
else {
|
|
|
|
printf("Expected %s on line %d\n", TokenExpected, Line);
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// The token most recently handed back by RejectToken(), or NULL if none
// is pending. Tokenise() clears it on its next call.
static struct Token* RejectedToken = NULL;

/*
 * Hands a token back to the tokeniser.
 *
 * Only one token can be pending at a time; rejecting a second one before
 * the first has been taken back is reported through Die().
 *
 * @param Token: The token to remember. Only the pointer is stored, so the
 *               pointed-to token must remain valid until Tokenise() runs.
 */
void RejectToken(struct Token* Token) {
    if (RejectedToken != NULL) {
        Die("Cannot reject two tokens in a row!");
    }

    RejectedToken = Token;
}
|
|
|
|
|
|
|
|
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
|
|
|
|
* * * *   L I T E R A L S   A N D   I D E N T I F I E R S   * * * *
|
|
|
|
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
|
|
|
|
|
|
|
|
/*
 * Reads a decimal integer literal from the stream.
 *
 * @param Char: The first character of the literal (already consumed by
 *              the caller).
 * @return The value of the run of decimal digits starting at Char.
 *
 * The first character that is not a decimal digit is handed back to the
 * stream so the next token can start from it.
 *
 * A literal that does not fit in an int is reported through Die() rather
 * than being allowed to overflow, since signed overflow is undefined
 * behaviour. This assumes Die() does not return - TODO confirm.
 */
static int ReadInteger(int Char) {
    int IntegerValue = 0;

    for (;;) {
        int Digit = FindDigitFromPos("0123456789", Char);

        // Anything outside 0..9 is not a digit. The upper bound guards
        // against a lookup that resolves to the table's terminator.
        if (Digit < 0 || Digit > 9)
            break;

        // Would IntegerValue * 10 + Digit exceed INT_MAX?
        if (IntegerValue > (INT_MAX - Digit) / 10)
            Die("Integer literal too large");

        IntegerValue = IntegerValue * 10 + Digit;
        Char = NextChar();
    }

    // Char now holds the first non-digit; give it back.
    ReturnCharToStream(Char);

    return IntegerValue;
}
|
|
|
|
|
|
|
|
// Variable identifier, keyword, function.
|
|
|
|
static int ReadIdentifier(int Char, char* Buffer, int Limit) {
|
|
|
|
int ind = 0;
|
|
|
|
|
|
|
|
// This defines the valid chars in a keyword/variable/function.
|
|
|
|
while(isalpha(Char) || isdigit(Char) || Char == '_') {
|
|
|
|
if (ind >= Limit - 1) {
|
|
|
|
printf("Identifier too long: %d\n", Line);
|
|
|
|
exit(1);
|
|
|
|
} else {
|
|
|
|
Buffer[ind++] = Char;
|
|
|
|
}
|
|
|
|
|
|
|
|
Char = NextChar();
|
|
|
|
}
|
|
|
|
|
|
|
|
// At this point, we've reached a non-keyword character
|
|
|
|
ReturnCharToStream(Char);
|
|
|
|
Buffer[ind] = '\0';
|
|
|
|
return ind;
|
|
|
|
}
|
|
|
|
|
2020-11-22 00:41:48 +00:00
|
|
|
/*
 * Reads one character of a character or string literal, decoding a
 * backslash escape if there is one.
 *
 * @return The character read. For a recognised escape (\a \b \f \n \r \t
 *         \v \\ \" \') the character it stands for.
 *
 * An unrecognised escape is reported through DieChar(); should that call
 * return, the character following the backslash is returned unchanged.
 */
static int ReadCharLiteral() {
    int Current = NextChar();

    // Ordinary characters are passed through untouched.
    if (Current != '\\')
        return Current;

    Current = NextChar();

    switch (Current) {
        case 'a': Current = '\a'; break;
        case 'b': Current = '\b'; break;
        case 'f': Current = '\f'; break;
        case 'n': Current = '\n'; break;
        case 'r': Current = '\r'; break;
        case 't': Current = '\t'; break;
        case 'v': Current = '\v'; break;

        // These escapes stand for themselves.
        case '\\':
        case '"':
        case '\'':
            break;

        default:
            DieChar("Unknown Escape: ", Current);
    }

    return Current;
}
|
|
|
|
|
2020-11-22 01:44:54 +00:00
|
|
|
|
|
|
|
static int ReadStringLiteral(char* Buffer) {
|
|
|
|
int Char;
|
|
|
|
|
|
|
|
for(int i = 0; i < TEXTLEN - 1; i++) {
|
|
|
|
if((Char = ReadCharLiteral()) == '"') {
|
|
|
|
Buffer[i] = 0; return i;
|
|
|
|
}
|
|
|
|
|
|
|
|
Buffer[i] = Char;
|
|
|
|
}
|
|
|
|
|
|
|
|
Die("String Literal Too Long");
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2020-09-10 00:56:16 +00:00
|
|
|
/*
|
|
|
|
* This function is what defines the valid keywords for the language
|
|
|
|
* //TODO: move this to a static list?
|
|
|
|
* //TODO: More optimisations?
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
static int ReadKeyword(char* Str) {
|
|
|
|
// First, scan with reference intact.
|
|
|
|
switch(*Str) {
|
|
|
|
// This lets us case against the first char:
|
|
|
|
case ':':
|
|
|
|
if(!strcmp(Str, "::"))
|
|
|
|
return KW_FUNC;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case 'c':
|
|
|
|
if(!strcmp(Str, "char"))
|
|
|
|
return TY_CHAR;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case 'e':
|
|
|
|
if(!strcmp(Str, "else"))
|
|
|
|
return KW_ELSE;
|
|
|
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
case 'f':
|
|
|
|
if(!strcmp(Str, "for"))
|
|
|
|
return KW_FOR;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case 'i':
|
|
|
|
|
|
|
|
if(!strcmp(Str, "int"))
|
|
|
|
return TY_INT;
|
|
|
|
|
|
|
|
if(!strcmp(Str, "if"))
|
|
|
|
return KW_IF;
|
|
|
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
case 'l':
|
|
|
|
if(!strcmp(Str, "long"))
|
|
|
|
return TY_LONG;
|
|
|
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
case 'p':
|
|
|
|
// This is a huge optimisation once we have as many keywords as a fully featured language.
|
|
|
|
if(!strcmp(Str, "print"))
|
|
|
|
return KW_PRINT;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case 'r':
|
|
|
|
if(!strcmp(Str, "return"))
|
|
|
|
return KW_RETURN;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case 'v':
|
|
|
|
if(!strcmp(Str, "void"))
|
|
|
|
return TY_VOID;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case 'w':
|
|
|
|
if(!strcmp(Str, "while"))
|
|
|
|
return KW_WHILE;
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* * * * * * * * * * * * * * * * * * * * *
|
|
|
|
* * * * T O K E N I S E R * * * *
|
|
|
|
* * * * * * * * * * * * * * * * * * * * */
|
|
|
|
|
|
|
|
int Tokenise(struct Token* Token) {
|
|
|
|
int Char, TokenType;
|
|
|
|
|
|
|
|
if(RejectedToken != NULL) {
|
|
|
|
Token = RejectedToken;
|
|
|
|
RejectedToken = NULL;
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
Char = FindChar();
|
|
|
|
|
|
|
|
switch(Char) {
|
|
|
|
case EOF:
|
|
|
|
Token->type = LI_EOF;
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
case '+':
|
2020-11-23 21:42:32 +00:00
|
|
|
// + can be either "+" or "++".
|
|
|
|
Char = NextChar();
|
|
|
|
if(Char == '+') {
|
|
|
|
Token->type = PPMM_PLUS;
|
|
|
|
} else {
|
|
|
|
Token->type = AR_PLUS;
|
|
|
|
ReturnCharToStream(Char);
|
|
|
|
}
|
2020-09-10 00:56:16 +00:00
|
|
|
break;
|
|
|
|
|
|
|
|
case '-':
|
2020-11-23 21:42:32 +00:00
|
|
|
// - can be either "-" or "--"
|
|
|
|
Char = NextChar();
|
|
|
|
if(Char == '-') {
|
|
|
|
Token->type = PPMM_MINUS;
|
|
|
|
} else {
|
|
|
|
Token->type = AR_MINUS;
|
|
|
|
ReturnCharToStream(Char);
|
|
|
|
}
|
2020-09-10 00:56:16 +00:00
|
|
|
break;
|
|
|
|
|
|
|
|
case '*':
|
|
|
|
Token->type = AR_STAR;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case '/':
|
|
|
|
Token->type = AR_SLASH;
|
|
|
|
break;
|
2020-09-13 01:26:49 +00:00
|
|
|
|
|
|
|
case '&':
|
2020-11-23 21:42:32 +00:00
|
|
|
Char = NextChar();
|
|
|
|
if(Char == '&') {
|
|
|
|
Token->type = BOOL_AND;
|
|
|
|
} else {
|
|
|
|
Token->type = BIT_AND;
|
|
|
|
ReturnCharToStream(Char);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case '|':
|
|
|
|
Char = NextChar();
|
|
|
|
if(Char == '|') {
|
|
|
|
Token->type = BOOL_OR;
|
|
|
|
} else {
|
|
|
|
Token->type = BIT_OR;
|
|
|
|
ReturnCharToStream(Char);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case '^':
|
|
|
|
Token->type = BIT_XOR;
|
2020-09-13 01:26:49 +00:00
|
|
|
break;
|
2020-09-10 00:56:16 +00:00
|
|
|
|
2020-11-23 21:42:32 +00:00
|
|
|
case '~':
|
|
|
|
Token->type = BIT_NOT;
|
|
|
|
break;
|
|
|
|
|
2020-09-13 22:41:46 +00:00
|
|
|
case ',':
|
|
|
|
Token->type = LI_COM;
|
|
|
|
break;
|
|
|
|
|
2020-09-10 00:56:16 +00:00
|
|
|
case '=':
|
|
|
|
Char = NextChar();
|
|
|
|
// If the next char is =, we have ==, the compare equality token.
|
|
|
|
if(Char == '?') {
|
|
|
|
Token->type = CMP_EQUAL;
|
|
|
|
// if the next char is >, we have =>, the greater than or equal token.
|
|
|
|
} else if(Char == '>') {
|
|
|
|
Token->type = CMP_GTE;
|
|
|
|
// If none of the above match, we have = and an extra char. Return the char and set the token
|
|
|
|
} else {
|
|
|
|
ReturnCharToStream(Char);
|
|
|
|
Token->type = LI_EQUAL;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case '!':
|
|
|
|
Char = NextChar();
|
|
|
|
// If the next char is =, we have !=, the compare inequality operator.
|
|
|
|
if(Char == '=') {
|
|
|
|
Token->type = CMP_INEQ;
|
|
|
|
// Otherwise, we have a spare char
|
|
|
|
} else {
|
2020-11-23 21:42:32 +00:00
|
|
|
Token->type = BOOL_INVERT;
|
2020-09-10 00:56:16 +00:00
|
|
|
ReturnCharToStream(Char);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case '<':
|
|
|
|
Char = NextChar();
|
|
|
|
// If the next char is =, we have <=, the less than or equal comparator.
|
|
|
|
if(Char == '=') {
|
|
|
|
Token->type = CMP_LTE;
|
2020-11-23 21:42:32 +00:00
|
|
|
} else if(Char == '<') { // But if the next char is <, we have << - the Shift Left operator.
|
|
|
|
Token->type = SH_LEFT;
|
2020-09-10 00:56:16 +00:00
|
|
|
} else {
|
|
|
|
ReturnCharToStream(Char);
|
|
|
|
Token->type = CMP_LT;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case '>':
|
2020-11-23 21:42:32 +00:00
|
|
|
// For >, Less than or equal is => so we can ignore it, but the Shift Right operator is >>.
|
|
|
|
Char = NextChar();
|
|
|
|
if(Char == '>') {
|
|
|
|
Token->type = SH_RIGHT;
|
|
|
|
} else {
|
|
|
|
Token->type = CMP_GT;
|
|
|
|
ReturnCharToStream(Char);
|
|
|
|
}
|
2020-09-10 00:56:16 +00:00
|
|
|
break;
|
|
|
|
|
|
|
|
case ';':
|
|
|
|
Token->type = LI_SEMIC;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case '(':
|
|
|
|
Token->type = LI_LPARE;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case ')':
|
|
|
|
Token->type = LI_RPARE;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case '{':
|
|
|
|
Token->type = LI_LBRAC;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case '}':
|
|
|
|
Token->type = LI_RBRAC;
|
|
|
|
break;
|
|
|
|
|
2020-11-18 20:49:08 +00:00
|
|
|
case '[':
|
|
|
|
Token->type = LI_LBRAS;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case ']':
|
|
|
|
Token->type = LI_RBRAS;
|
|
|
|
break;
|
|
|
|
|
2020-09-10 00:56:16 +00:00
|
|
|
case ':':
|
|
|
|
Char = NextChar();
|
|
|
|
|
|
|
|
if(Char == ':') {
|
|
|
|
Token->type = KW_FUNC;
|
|
|
|
} else {
|
|
|
|
ReturnCharToStream(Char);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
2020-11-22 00:41:48 +00:00
|
|
|
case '\'':
|
|
|
|
Token->value = ReadCharLiteral();
|
|
|
|
Token->type = LI_INT;
|
|
|
|
|
|
|
|
if(NextChar() != '\'')
|
|
|
|
Die("Expected '\\'' at the end of a character.");
|
|
|
|
break;
|
|
|
|
|
2020-11-22 01:44:54 +00:00
|
|
|
case '"':
|
|
|
|
ReadStringLiteral(CurrentIdentifier);
|
|
|
|
Token->type = LI_STR;
|
|
|
|
break;
|
|
|
|
|
2020-09-10 00:56:16 +00:00
|
|
|
default:
|
|
|
|
if(isdigit(Char)) {
|
|
|
|
|
|
|
|
Token->value = ReadInteger(Char);
|
|
|
|
Token->type = LI_INT;
|
|
|
|
break;
|
|
|
|
|
|
|
|
} else if(isalpha(Char) || Char == '_') { // This is what defines what a variable/function/keyword can START with.
|
|
|
|
ReadIdentifier(Char, CurrentIdentifier, TEXTLEN);
|
|
|
|
|
|
|
|
if(TokenType = ReadKeyword(CurrentIdentifier)) {
|
|
|
|
Token->type = TokenType;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
Token->type = TY_IDENTIFIER;
|
|
|
|
break;
|
|
|
|
//printf("Line %d: Unrecognized symbol %s\n", CurrentIdentifier, Line);
|
|
|
|
//exit(1);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
DieChar("Unrecognized character", Char);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|