/*************/ /*GEMWIRE */ /* ERYTHRO*/ /*************/ #include #include /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * C H A R S T R E AM * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ static void ReturnCharToStream(int Char) { Overread = Char; } static int NextChar(void) { int Char; if(Overread) { Char = Overread; Overread = 0; return Char; } Char = fgetc(SourceFile); if(Char == '\n') Line++; return Char; } static int FindChar() { int Char; Char = NextChar(); while(Char == ' ' || Char == '\t' || Char == '\n' || Char == '\r') { Char = NextChar(); } return Char; } static int FindDigitFromPos(char* String, char Char) { char* Result = strchr(String, Char); return(Result ? Result - String : -1); } void VerifyToken(int Type, char* TokenExpected) { if(CurrentToken.type == Type) Tokenise(&CurrentToken); else { printf("Expected %s on line %d\n", TokenExpected, Line); exit(1); } } static struct Token* RejectedToken = NULL; void RejectToken(struct Token* Token) { if(RejectedToken != NULL) Die("Cannot reject two tokens in a row!"); RejectedToken = Token; } /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * L I T E R A L S A N D I D E N T I F I E R S * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ static int ReadInteger(int Char) { int CurrentChar = 0; int IntegerValue = 0; while((CurrentChar = FindDigitFromPos("0123456789", Char)) >= 0) { IntegerValue = IntegerValue * 10 + CurrentChar; Char = NextChar(); } ReturnCharToStream(Char); return IntegerValue; } // Variable identifier, keyword, function. static int ReadIdentifier(int Char, char* Buffer, int Limit) { int ind = 0; // This defines the valid chars in a keyword/variable/function. while(isalpha(Char) || isdigit(Char) || Char == '_') { if (ind >= Limit - 1) { printf("Identifier too long: %d\n", Line); exit(1); } else { Buffer[ind++] = Char; } Char = NextChar(); } // At this point, we've reached a non-keyword character ReturnCharToStream(Char); Buffer[ind] = '\0'; return ind; } static int ReadCharLiteral() { int Char; Char = NextChar(); if(Char == '\\') { switch(Char = NextChar()) { case 'a': return '\a'; case 'b': return '\b'; case 'f': return '\f'; case 'n': return '\n'; case 'r': return '\r'; case 't': return '\t'; case 'v': return '\v'; case '\\': return '\\'; case '"': return '"'; case '\'': return '\''; default: DieChar("Unknown Escape: ", Char); } } return Char; } static int ReadStringLiteral(char* Buffer) { int Char; for(int i = 0; i < TEXTLEN - 1; i++) { if((Char = ReadCharLiteral()) == '"') { Buffer[i] = 0; return i; } Buffer[i] = Char; } Die("String Literal Too Long"); return 0; } /* * This function is what defines the valid keywords for the language * //TODO: move this to a static list? * //TODO: More optimisations? * */ static int ReadKeyword(char* Str) { // First, scan with reference intact. switch(*Str) { // This lets us case against the first char: case ':': if(!strcmp(Str, "::")) return KW_FUNC; break; case 'c': if(!strcmp(Str, "char")) return TY_CHAR; break; case 'e': if(!strcmp(Str, "else")) return KW_ELSE; break; case 'f': if(!strcmp(Str, "for")) return KW_FOR; break; case 'i': if(!strcmp(Str, "int")) return TY_INT; if(!strcmp(Str, "if")) return KW_IF; break; case 'l': if(!strcmp(Str, "long")) return TY_LONG; break; case 'p': // This is a huge optimisation once we have as many keywords as a fully featured language. if(!strcmp(Str, "print")) return KW_PRINT; break; case 'r': if(!strcmp(Str, "return")) return KW_RETURN; break; case 'v': if(!strcmp(Str, "void")) return TY_VOID; break; case 'w': if(!strcmp(Str, "while")) return KW_WHILE; break; } return 0; } /* * * * * * * * * * * * * * * * * * * * * * * * * T O K E N I S E R * * * * * * * * * * * * * * * * * * * * * * * * */ int Tokenise(struct Token* Token) { int Char, TokenType; if(RejectedToken != NULL) { Token = RejectedToken; RejectedToken = NULL; return 1; } Char = FindChar(); switch(Char) { case EOF: Token->type = LI_EOF; return 0; case '+': // + can be either "+" or "++". Char = NextChar(); if(Char == '+') { Token->type = PPMM_PLUS; } else { Token->type = AR_PLUS; ReturnCharToStream(Char); } break; case '-': // - can be either "-" or "--" Char = NextChar(); if(Char == '-') { Token->type = PPMM_MINUS; } else { Token->type = AR_MINUS; ReturnCharToStream(Char); } break; case '*': Token->type = AR_STAR; break; case '/': Token->type = AR_SLASH; break; case '&': Char = NextChar(); if(Char == '&') { Token->type = BOOL_AND; } else { Token->type = BIT_AND; ReturnCharToStream(Char); } break; case '|': Char = NextChar(); if(Char == '|') { Token->type = BOOL_OR; } else { Token->type = BIT_OR; ReturnCharToStream(Char); } break; case '^': Token->type = BIT_XOR; break; case '~': Token->type = BIT_NOT; break; case ',': Token->type = LI_COM; break; case '=': Char = NextChar(); // If the next char is =, we have ==, the compare equality token. if(Char == '?') { Token->type = CMP_EQUAL; // if the next char is >, we have =>, the greater than or equal token. } else if(Char == '>') { Token->type = CMP_GTE; // If none of the above match, we have = and an extra char. Return the char and set the token } else { ReturnCharToStream(Char); Token->type = LI_EQUAL; } break; case '!': Char = NextChar(); // If the next char is =, we have !=, the compare inequality operator. if(Char == '=') { Token->type = CMP_INEQ; // Otherwise, we have a spare char } else { Token->type = BOOL_INVERT; ReturnCharToStream(Char); } break; case '<': Char = NextChar(); // If the next char is =, we have <=, the less than or equal comparator. if(Char == '=') { Token->type = CMP_LTE; } else if(Char == '<') { // But if the next char is <, we have << - the Shift Left operator. Token->type = SH_LEFT; } else { ReturnCharToStream(Char); Token->type = CMP_LT; } break; case '>': // For >, Less than or equal is => so we can ignore it, but the Shift Right operator is >>. Char = NextChar(); if(Char == '>') { Token->type = SH_RIGHT; } else { Token->type = CMP_GT; ReturnCharToStream(Char); } break; case ';': Token->type = LI_SEMIC; break; case '(': Token->type = LI_LPARE; break; case ')': Token->type = LI_RPARE; break; case '{': Token->type = LI_LBRAC; break; case '}': Token->type = LI_RBRAC; break; case '[': Token->type = LI_LBRAS; break; case ']': Token->type = LI_RBRAS; break; case ':': Char = NextChar(); if(Char == ':') { Token->type = KW_FUNC; } else { ReturnCharToStream(Char); } break; case '\'': Token->value = ReadCharLiteral(); Token->type = LI_INT; if(NextChar() != '\'') Die("Expected '\\'' at the end of a character."); break; case '"': ReadStringLiteral(CurrentIdentifier); Token->type = LI_STR; break; default: if(isdigit(Char)) { Token->value = ReadInteger(Char); Token->type = LI_INT; break; } else if(isalpha(Char) || Char == '_') { // This is what defines what a variable/function/keyword can START with. ReadIdentifier(Char, CurrentIdentifier, TEXTLEN); if(TokenType = ReadKeyword(CurrentIdentifier)) { Token->type = TokenType; break; } Token->type = TY_IDENTIFIER; break; //printf("Line %d: Unrecognized symbol %s\n", CurrentIdentifier, Line); //exit(1); } DieChar("Unrecognized character", Char); } return 1; }