Erythro/src/Parser.c

519 lines
16 KiB
C

/*************/
/*GEMWIRE */
/* ERYTHRO*/
/*************/
#include <stdio.h>
#include <stdlib.h>
#include "Defs.h"
#include "Data.h"
#if defined(__GNUC__) || defined(APPLE)
#include <errno.h>
#endif
/*
* The Precedence of an operator is directly related to Token Type.
* Precedence determines how soon the operator and its surrounding values
* will be calculated and aliased.
* This allows for things like the common Order of Operations.
*/
static int Precedence[] = {
0, 10, // EOF, ASSIGN
20, 30, // || &&
40, 50, // | ^
60, 70, // & =?
70, 80, // != <
80, 80, // > <=
80, 90, // => <<
90, 100, // >> +
100, 110, // - *
110 // /
};
/*
* Handles gathering the precedence of an operator from its token,
* in terms of values of the TokenTypes enum.
*
* Error handling is also done here, so that EOF or non-operators are not executed.
*
*/
static int OperatorPrecedence(int Token) {
int Prec = Precedence[Token];
if (Prec == 0 || Token >= PPMM_PLUS) {
if (Token == TY_IDENTIFIER) {
ErrorReport("Attempting to determine operator precedence of identifier %s\n", CurrentIdentifier);
exit(1);
}
ErrorReport("Attempting to determine operator precedence of an EOF or INT literal: %s\n", TokenNames[Token]);
exit(1);
}
return Prec;
}
/*
* If the value is a right-expression, or in other words is right associative,
* then it can be safely calculated beforehand and aliased to a value.
* In this case, we can try to alias (or constant fold) everything on the right side
* of an assignment.
*/
static int IsRightExpr(int Token) {
return (Token == LI_EQUAL);
}
/* * * * * * * * * * * * * * * * * * * * * * * *
* * * N O D E C O N S T R U C T I O N * * *
* * * * * * * * * * * * * * * * * * * * * * * */
/*
* ASTNodes form the structure of the language that moves the bulk of
* data around within the compiler.
* They contain:
* * An Operation (usually 1:1 with an input token),
* * A Type (to identify the size of data it contains),
* * Two more Left and Right ASTNodes (to form a doubly-linked list)
* * An extra Middle ASTNode in case it is needed (typically in the middle case of a For loop)
* * A Symbol Table Entry
* * An Integer Value
* * A flag to determine whether this node (and its sub-nodes) contain a right associative or Rval
*
* This is the only function where they are constructed.
*
* @param Operation: The input Op of this Node, in terms of values of the SyntaxOps enum
* @param Type: The data type of this Node, in terms of values of the DataTypes enum.
* @param Left: The Node that is attached to the left side branch of this root.
* @param Middle: The Node that is attached to the middle of this root, if applicable.
* @param Right: The Node that is attached to the right side branch of this root.
* @param Symbol: The Symbol Table Entry that represents this Node, if applicable.
* @param IntValue: The integer value encoded by this Node, if applicable.
* @return a newly constructed AST Node
*/
struct ASTNode* ConstructASTNode(int Operation, int Type,
struct ASTNode* Left,
struct ASTNode* Middle,
struct ASTNode* Right,
struct SymbolTableEntry* Symbol,
int IntValue) {
struct ASTNode* Node;
Node = (struct ASTNode*) malloc(sizeof(struct ASTNode));
if (!Node)
ErrorReport("Unable to allocate node: %s\n", errno);
Node->Operation = Operation;
Node->ExprType = Type;
Node->Left = Left;
Node->Middle = Middle;
Node->Right = Right;
Node->Symbol = Symbol;
Node->IntValue = IntValue;
return Node;
}
/*
* AST Leaves are categorized by their lack of child nodes.
* @param Operation: The input Op of this Node, in terms of values of the SyntaxOps enum
* @param Type: The data type of this Node, in terms of values of the DataTypes enum.
* @param Symbol: The Symbol Table Entry that represents this Node, if applicable.
* @param IntValue: The integer value encoded by this Node, if applicable.
* @return a newly constructed AST Node
*/
struct ASTNode* ConstructASTLeaf(int Operation, int Type, struct SymbolTableEntry* Symbol, int IntValue) {
return ConstructASTNode(Operation, Type, NULL, NULL, NULL, Symbol, IntValue);
}
/*
* AST Branches are categorized by having only one child node.
* These are sometimes called Unary Branches.
* @param Operation: The input Op of this Node, in terms of values of the SyntaxOps enum
* @param Type: The data type of this Node, in terms of values of the DataTypes enum.
* @param Left: The Node that is attached to the left side branch of this root.
* @param Symbol: The Symbol Table Entry that represents this Node, if applicable.
* @param IntValue: The integer value encoded by this Node, if applicable.
* @return a newly constructed AST Node
*/
struct ASTNode* ConstructASTBranch(int Operation, int Type, struct ASTNode* Left, struct SymbolTableEntry* Symbol, int IntValue) {
return ConstructASTNode(Operation, Type, Left, NULL, NULL, Symbol, IntValue);
}
/* * * * * * * * * * * * * * * * * * * * * * * *
* * * * T O K E N P A R S I N G * * * *
* * * * * * * * * * * * * * * * * * * * * * * */
/*
* TokenTypes and SyntaxOps are mostly 1:1, so some minor effort can ensure that
* these are synchronized well.
* This allows the parsing operation to be little more than a bounds check.
* Otherwise, this would be a gigantic switch statement.
*/
int ParseTokenToOperation(int Token) {
if (Token > LI_EOF && Token < LI_INT)
return Token;
ErrorReport("ParseToken: Unknown token %s\n", TokenNames[Token]);
}
/*
* Primary expressions may be any one of:
* * A terminal integer literal
* * A terminal string literal
* * A variable
* * A collection of expressions bounded by parentheses.
*
* @return the AST Node that represents this expression
*/
struct ASTNode* ParsePrimary(void) {
struct ASTNode* Node;
int ID;
switch (CurrentFile->CurrentSymbol.type) {
case LI_INT:
if ((CurrentFile->CurrentSymbol.value >= 0) && (CurrentFile->CurrentSymbol.value < 256))
Node = ConstructASTLeaf(TERM_INTLITERAL, RET_CHAR, NULL, CurrentFile->CurrentSymbol.value);
else
Node = ConstructASTLeaf(TERM_INTLITERAL, RET_INT, NULL, CurrentFile->CurrentSymbol.value);
break;
case LI_STR:
ID = Assembler->vtable->AsNewString(CurrentIdentifier);
Node = ConstructASTLeaf(TERM_STRLITERAL, PointerTo(RET_CHAR), NULL, ID);
break;
case TY_IDENTIFIER:
return PostfixStatement();
case LI_LPARE:
// Starting a ( expr ) block
Tokenise();
Node = ParsePrecedenceASTNode(0);
// Make sure we close
VerifyToken(LI_RPARE, ")");
return Node;
}
Tokenise();
Safe();
return Node;
}
/*
* Parse a single binary expression.
* It ensures that these expressions are parsed to their full extent, that
* the order of operations is upheld, that the precedence of the prior
* iteration is considered, and that every error is handled.
*
* This is where all the right-associative statements are folded, where
* type mismatches and widening are handled properly, and that all parsing
* is over by the time the end tokens ") } ] ;" are encountered.
*
* @param PreviousTokenPrecedence: The precedence of the operator to the left.
* @return the AST Node corresponding to this block.
*
*/
struct ASTNode* ParsePrecedenceASTNode(int PreviousTokenPrecedence) {
struct ASTNode* LeftNode, * RightNode;
struct ASTNode* LeftTemp, * RightTemp;
// int LeftType, RightType;
int NodeType, OpType;
Safe();
LeftNode = PrefixStatement();
NodeType = CurrentFile->CurrentSymbol.type;
if (NodeType == LI_SEMIC || NodeType == LI_COLON || NodeType == LI_RPARE || NodeType == LI_RBRAS || NodeType == LI_COM || NodeType == LI_INT) {
LeftNode->RVal = 1;
return LeftNode;
}
Safe();
while ((OperatorPrecedence(NodeType) > PreviousTokenPrecedence) ||
(IsRightExpr(OpType) && OperatorPrecedence(OpType) == PreviousTokenPrecedence)) {
Tokenise();
if (CurrentFile->CurrentSymbol.type == LI_RPARE)
break;
Safe();
RightNode = ParsePrecedenceASTNode(Precedence[NodeType]);
Safe();
/**
* While parsing this node, we may need to widen some types.
* This requires a few functions and checks.
*/
OpType = ParseTokenToOperation(NodeType);
if (OpType == OP_ASSIGN) {
printf("\tParsePrecedenceASTNode: Assignment statement\r\n");
RightNode->RVal = 1;
LeftNode->RVal = 0;
RightNode = MutateType(RightNode, LeftNode->ExprType, 0);
if (RightNode == NULL)
ErrorReport("Incompatible types encountered in assignment: %s, %s\n", TypeNames(RightNode->ExprType), TypeNames(LeftNode->ExprType));
// LeftNode holds the target, the target variable in this case
printf("\t\tAssigning variable: %s\n", LeftNode->Symbol->Name);
LeftTemp = LeftNode;
LeftNode = RightNode;
RightNode = LeftTemp;
// Clear temps as insurance
RightTemp = NULL;
LeftTemp = NULL;
} else {
printf("\t\tAttempting to handle a %s in Binary Expression parsing\r\n", TokenNames[CurrentFile->CurrentSymbol.type]);
LeftNode->RVal = 1;
RightNode->RVal = 1;
LeftTemp = MutateType(LeftNode, RightNode->ExprType, OpType);
RightTemp = MutateType(RightNode, LeftNode->ExprType, OpType);
/**
* If both are null, the types are incompatible.
*/
if (LeftTemp == NULL && RightTemp == NULL)
ErrorReport("Incompatible types in parsing nodes: %s, %s\n", TypeNames(LeftNode->ExprType), TypeNames(RightNode->ExprType));
/**
* If the left was valid, or valid for
* expansion, then it will be non-null.
*
* If it was valid, then this will be
* equivalent to LeftNode = LeftNode
*/
if (LeftTemp != NULL)
LeftNode = LeftTemp;
/**
* Same here, but there is a higher chance
* for the right node to be incompatible due
* to the nature of widening types.
*/
if (RightTemp != NULL)
RightNode = RightTemp;
}
LeftNode = ConstructASTNode(ParseTokenToOperation(NodeType), LeftNode->ExprType, LeftNode, NULL, RightNode,
NULL, 0);
NodeType = CurrentFile->CurrentSymbol.type;
if (NodeType == LI_SEMIC || NodeType == LI_RPARE || NodeType == LI_RBRAS) {
LeftNode->RVal = 1;
return LeftNode;
}
}
LeftNode->RVal = 1;
return LeftNode;
}
/* * * * * * * * * * * * * * * * * * * * *
* * * * F U N C T I O N S * * * *
* * * * * * * * * * * * * * * * * * * * */
/*
* Handles the logic for calling a function.
* This is invoked by an identifier being recognized, followed by a "(.*)" string.
*
* It simply checks that the function exists, that the parameters given are valid,
* and generates the AST Node for calling it.
*
* @return the AST Node for calling the function stored in CurrentIdentifer
*
*/
struct ASTNode* CallFunction() {
struct ASTNode* Tree;
struct SymbolTableEntry* Function;
//TODO: Test structural type!
if ((Function = FindSymbol(CurrentIdentifier)) == NULL || (Function->Structure != ST_FUNC))
ErrorReport("Undeclared function: %s\n", CurrentIdentifier);
VerifyToken(LI_LPARE, "(");
Tree = ParseExpressionList(LI_RPARE);
Tree = ConstructASTBranch(OP_CALL, Function->Type, Tree, Function, 0);
VerifyToken(LI_RPARE, ")");
return Tree;
}
/*
* An expression list is used:
* * In the call to a function
*
* It is parsed by seeking left parentheses "(", parsing binary expressions
* until either a comma or a right parentheses is found.
*
* The former will cause another expression to be parsed, the latter will cause
* parsing to stop.
*
* @return the AST Node representing every expression in the list, glued end to
* end with a COMPOSITE operation.
*
*/
struct ASTNode* ParseExpressionList(int terminate) {
struct ASTNode* Tree = NULL, * Child = NULL;
int Count = 0;
Safe();
while (CurrentFile->CurrentSymbol.type != terminate) {
// TODO: for(int x = 0;
Child = ParsePrecedenceASTNode(0);
printf("\nFunction parameter %d is type %s.\n", Count, TypeNames(Child->ExprType));
Count++;
Tree = ConstructASTNode(OP_COMP, PointerTo(RET_VOID), Tree, NULL, Child, NULL, Count);
if (CurrentFile->CurrentSymbol.type == terminate)
break;
VerifyToken(LI_COM, ",");
Safe();
}
return Tree;
}
/* * * * * * * * * * * * * * * * * * * * * *
* * * * S T A T E M E N T S * * * *
* * * * * * * * * * * * * * * * * * * * * */
/*
* Handles parsing an individual statement.
* This has the ability to parse Compounds in the case it starts with a left brace.
* This solves the dangling else problem for the parser.
*
* It serves as a wrapper around:
* * If Statement
* * While Statement
* * For Statement
* * Switch Statement
* * Return Statement
* * Numeric literals and variables
* * Binary Expressions
*
* @return the AST Node representing this single statement
*/
struct ASTNode* ParseStatement(void) {
struct ASTNode* Node;
struct SymbolTableEntry* Composite;
printf("\t\tBranch leads to type %s/%d\r\n", TokenNames[CurrentFile->CurrentSymbol.type], CurrentFile->CurrentSymbol.type);
switch (CurrentFile->CurrentSymbol.type) {
case LI_LBRAC:
Tokenise();
Node = ParseCompound();
VerifyToken(LI_RBRAC, "}");
return Node;
case TY_IDENTIFIER:
if (FindAlias(CurrentIdentifier) == NULL) {
Node = ParsePrecedenceASTNode(0);
VerifyToken(LI_SEMIC, ";");
return Node;
}
case TY_CHAR:
case TY_LONG:
case TY_INT:
case KW_STRUCT:
case KW_UNION:
case KW_ENUM:
case KW_ALIAS:
printf("\t\tTrying new Variable: %s\n", CurrentIdentifier);
ParseDeclarationList(&Composite, SC_LOCAL, LI_SEMIC, LI_EOF, &Node);
VerifyToken(LI_SEMIC, ";");
Safe();
return NULL;
case KW_SWITCH:
return SwitchStatement();
case KW_IF:
return IfStatement();
case KW_WHILE:
return WhileStatement();
case KW_FOR:
return ForStatement();
case KW_RETURN:
return ReturnStatement();
case KW_BREAK:
return BreakStatement();
case KW_CONTINUE:
return ContinueStatement();
default:
Node = ParsePrecedenceASTNode(0);
VerifyToken(LI_SEMIC, ";");
return Node;
}
}
/*
* This is the entry point to the parser/lexer.
*
* By definition, Global definitions are accessible anywhere.
* As of right now (20/01/2021), classe are unimplemented.
* This means that all functions and all function prototypes are globally scoped.
*
* You may also define variables, constants, preprocessor directives and other text
* in the global scope.
*
* The function itself loops, parsing either variables or functions, until it
* reaches the end of the file.
*
*/
void ParseGlobals() {
struct SymbolTableEntry* Composite;
struct ASTNode* empty;
printf("Parsing global definitions\r\n");
while (CurrentFile->CurrentSymbol.type != LI_EOF) {
// Read in a declaration, or list thereof
ParseDeclarationList(&Composite, SC_GLOBAL, LI_SEMIC, LI_EOF, &empty);
// Consume semicolons if present
OptionallyConsume(LI_SEMIC);
}
}